commit 74907ed960ca09f20b523502d06d524b586bb32d Author: ModelHub XC Date: Thu Jun 4 23:30:35 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathv00.02_s43 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new 
file mode 100644 index 0000000..80d9c74 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathv00.02_s43 +tags: +- generated_from_trainer +- open-r1 +- sft +- trl +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathv00.02_s43 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathv00.02_s43", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/5zdzdstv) + + + +This model was trained with SFT. 
+ +### Framework versions + +- TRL: 1.1.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..7899703 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.628159003328302e+19, + "train_loss": 0.3662048727583553, + "train_runtime": 42474.8118, + "train_samples": 125770, + "train_samples_per_second": 8.883, + "train_steps_per_second": 0.555 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... 
+</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} 
+{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..af3655d --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec7ca7beafe6f4620090655ac953f20774fdb02e66fdb44355881fc0d8e1c933 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..54b456b --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b84a4d6b0259306c3f0be467f8df74db3dc1180d261ffca7ed12d61fbbe9542 +size 4999802720 diff --git 
a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..8f3359d --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00eff680b6e75f8f861f2ae6a97bae4ba4c772dd0815c24828876a472dc33fd6 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..ad528b8 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12731f620f36f8179eff5c14699268cb410b6dd6d1f55e79ee6ccdd9a65264ed +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": 
"<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": 
"<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": 
"<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..7899703 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.628159003328302e+19, + "train_loss": 0.3662048727583553, + "train_runtime": 42474.8118, + "train_samples": 125770, + "train_samples_per_second": 8.883, + "train_steps_per_second": 0.555 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..2cb77fc --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,212290 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "grad_norm": 5.112705707550049, + "learning_rate": 0.0, + "loss": 0.8594, + "mean_token_accuracy": 0.7632832527160645, + "num_tokens": 39170.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "grad_norm": 4.658906936645508, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8038, + 
"mean_token_accuracy": 0.7740107774734497, + "num_tokens": 79650.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "grad_norm": 4.923088073730469, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7784, + "mean_token_accuracy": 0.782990574836731, + "num_tokens": 116895.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "grad_norm": 4.920088768005371, + "learning_rate": 1.271725307333616e-09, + "loss": 0.8271, + "mean_token_accuracy": 0.7668754458427429, + "num_tokens": 154216.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "grad_norm": 4.644379615783691, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.7281, + "mean_token_accuracy": 0.7935225963592529, + "num_tokens": 192509.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "grad_norm": 4.793134689331055, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7787, + "mean_token_accuracy": 0.7838712930679321, + "num_tokens": 228902.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "grad_norm": 4.288508415222168, + "learning_rate": 2.543450614667232e-09, + "loss": 0.7833, + "mean_token_accuracy": 0.7799034118652344, + "num_tokens": 272297.0, + "step": 7 + }, + { + "epoch": 0.001017682228724081, + "grad_norm": 4.992496013641357, + "learning_rate": 2.967359050445104e-09, + "loss": 0.797, + "mean_token_accuracy": 0.778503954410553, + "num_tokens": 308129.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "grad_norm": 4.694489479064941, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.809, + "mean_token_accuracy": 0.772209882736206, + "num_tokens": 348780.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "grad_norm": 4.848956108093262, + "learning_rate": 3.815175922000847e-09, + "loss": 0.7982, + "mean_token_accuracy": 0.7799160480499268, + "num_tokens": 385881.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "grad_norm": 5.381214141845703, + "learning_rate": 4.239084357778719e-09, + "loss": 0.8739, + "mean_token_accuracy": 
0.7612254619598389, + "num_tokens": 419746.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "grad_norm": 5.15503454208374, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8361, + "mean_token_accuracy": 0.7703356742858887, + "num_tokens": 452147.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "grad_norm": 5.12066650390625, + "learning_rate": 5.086901229334464e-09, + "loss": 0.8533, + "mean_token_accuracy": 0.7634961009025574, + "num_tokens": 490241.0, + "step": 13 + }, + { + "epoch": 0.0017809439002671415, + "grad_norm": 4.544832229614258, + "learning_rate": 5.510809665112336e-09, + "loss": 0.7438, + "mean_token_accuracy": 0.7934238910675049, + "num_tokens": 531023.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "grad_norm": 4.599610328674316, + "learning_rate": 5.934718100890208e-09, + "loss": 0.7986, + "mean_token_accuracy": 0.7752445936203003, + "num_tokens": 573152.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "grad_norm": 4.7829999923706055, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8841, + "mean_token_accuracy": 0.7514467239379883, + "num_tokens": 615157.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "grad_norm": 5.63701057434082, + "learning_rate": 6.782534972445951e-09, + "loss": 0.8236, + "mean_token_accuracy": 0.7707278728485107, + "num_tokens": 645632.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "grad_norm": 4.598687171936035, + "learning_rate": 7.206443408223823e-09, + "loss": 0.8581, + "mean_token_accuracy": 0.7660191059112549, + "num_tokens": 688798.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "grad_norm": 5.11918830871582, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8155, + "mean_token_accuracy": 0.7730510234832764, + "num_tokens": 722451.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + "grad_norm": 4.773462772369385, + "learning_rate": 8.054260279779567e-09, + "loss": 0.8643, + "mean_token_accuracy": 
0.7601308226585388, + "num_tokens": 761569.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "grad_norm": 4.538682460784912, + "learning_rate": 8.478168715557438e-09, + "loss": 0.807, + "mean_token_accuracy": 0.7776646018028259, + "num_tokens": 803477.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "grad_norm": 4.730398178100586, + "learning_rate": 8.902077151335311e-09, + "loss": 0.7544, + "mean_token_accuracy": 0.7887319922447205, + "num_tokens": 839552.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "grad_norm": 5.054384708404541, + "learning_rate": 9.325985587113182e-09, + "loss": 0.78, + "mean_token_accuracy": 0.7804571390151978, + "num_tokens": 874137.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "grad_norm": 5.026080131530762, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8125, + "mean_token_accuracy": 0.7738168239593506, + "num_tokens": 909950.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "grad_norm": 4.683595657348633, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.7973, + "mean_token_accuracy": 0.7795525193214417, + "num_tokens": 948927.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "grad_norm": 4.609133720397949, + "learning_rate": 1.05977108944468e-08, + "loss": 0.7634, + "mean_token_accuracy": 0.786624014377594, + "num_tokens": 987077.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "grad_norm": 5.076290607452393, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.8464, + "mean_token_accuracy": 0.7648380398750305, + "num_tokens": 1023947.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "grad_norm": 4.74932336807251, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.7799, + "mean_token_accuracy": 0.784994900226593, + "num_tokens": 1060700.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "grad_norm": 4.610493183135986, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.7679, + "mean_token_accuracy": 
0.7798852920532227, + "num_tokens": 1099464.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "grad_norm": 4.197378158569336, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.7609, + "mean_token_accuracy": 0.7842599153518677, + "num_tokens": 1146534.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "grad_norm": 5.008591651916504, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.8428, + "mean_token_accuracy": 0.7617281675338745, + "num_tokens": 1184539.0, + "step": 31 + }, + { + "epoch": 0.004070728914896324, + "grad_norm": 4.891098976135254, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8254, + "mean_token_accuracy": 0.7712481021881104, + "num_tokens": 1224478.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "grad_norm": 4.423603534698486, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.7652, + "mean_token_accuracy": 0.783380925655365, + "num_tokens": 1266135.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "grad_norm": 4.745572090148926, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.8151, + "mean_token_accuracy": 0.7736899256706238, + "num_tokens": 1306494.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "grad_norm": 5.026683330535889, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.827, + "mean_token_accuracy": 0.7714340686798096, + "num_tokens": 1341861.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "grad_norm": 5.143521785736084, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.7848, + "mean_token_accuracy": 0.7848769426345825, + "num_tokens": 1373282.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "grad_norm": 5.01332426071167, + "learning_rate": 1.526070368800339e-08, + "loss": 0.8244, + "mean_token_accuracy": 0.7723448276519775, + "num_tokens": 1410197.0, + "step": 37 + }, + { + "epoch": 0.004833990586439385, + "grad_norm": 4.624547958374023, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7761, + "mean_token_accuracy": 
0.7799388766288757, + "num_tokens": 1451151.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "grad_norm": 5.403928279876709, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.8073, + "mean_token_accuracy": 0.7720504999160767, + "num_tokens": 1482491.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "grad_norm": 4.711781978607178, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7767707109451294, + "num_tokens": 1518599.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "grad_norm": 4.51467752456665, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8342, + "mean_token_accuracy": 0.7718529105186462, + "num_tokens": 1559022.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "grad_norm": 4.670768737792969, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8228, + "mean_token_accuracy": 0.7739574909210205, + "num_tokens": 1598559.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "grad_norm": 4.874985694885254, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.8263, + "mean_token_accuracy": 0.7688673138618469, + "num_tokens": 1638212.0, + "step": 43 + }, + { + "epoch": 0.005597252257982445, + "grad_norm": 4.720326900482178, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.7984, + "mean_token_accuracy": 0.7772597074508667, + "num_tokens": 1679236.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "grad_norm": 4.90848445892334, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8018, + "mean_token_accuracy": 0.776411771774292, + "num_tokens": 1717438.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "grad_norm": 5.148610591888428, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.8209, + "mean_token_accuracy": 0.7697130441665649, + "num_tokens": 1750364.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "grad_norm": 4.899515628814697, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8707, + "mean_token_accuracy": 
0.7662723064422607, + "num_tokens": 1789042.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "grad_norm": 5.34881067276001, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.8534, + "mean_token_accuracy": 0.7623584866523743, + "num_tokens": 1823772.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "grad_norm": 4.64490270614624, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.8353, + "mean_token_accuracy": 0.766684889793396, + "num_tokens": 1867281.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + "grad_norm": 4.6883745193481445, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.7672, + "mean_token_accuracy": 0.7847535014152527, + "num_tokens": 1906609.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "grad_norm": 4.401918411254883, + "learning_rate": 2.11954217888936e-08, + "loss": 0.7366, + "mean_token_accuracy": 0.7959910035133362, + "num_tokens": 1946715.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "grad_norm": 4.657690525054932, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7956, + "mean_token_accuracy": 0.7769267559051514, + "num_tokens": 1985337.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "grad_norm": 5.108847141265869, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.7788, + "mean_token_accuracy": 0.7802899479866028, + "num_tokens": 2017464.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "grad_norm": 4.69527530670166, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.7834, + "mean_token_accuracy": 0.7818881273269653, + "num_tokens": 2056793.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "grad_norm": 4.736614227294922, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.7941, + "mean_token_accuracy": 0.7779852151870728, + "num_tokens": 2093745.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "grad_norm": 4.803955078125, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8185, + "mean_token_accuracy": 
0.7758020162582397, + "num_tokens": 2134168.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "grad_norm": 4.341311931610107, + "learning_rate": 2.373887240356083e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7731237411499023, + "num_tokens": 2178105.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "grad_norm": 4.7604780197143555, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.8095, + "mean_token_accuracy": 0.7752113342285156, + "num_tokens": 2215312.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "grad_norm": 4.770262241363525, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.7915, + "mean_token_accuracy": 0.7841527462005615, + "num_tokens": 2252517.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "grad_norm": 4.913504123687744, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.8165, + "mean_token_accuracy": 0.7736396789550781, + "num_tokens": 2287396.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "grad_norm": 5.079829216003418, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.8415, + "mean_token_accuracy": 0.766096830368042, + "num_tokens": 2321104.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "grad_norm": 4.938776969909668, + "learning_rate": 2.585841458245019e-08, + "loss": 0.7977, + "mean_token_accuracy": 0.7725139856338501, + "num_tokens": 2358154.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "grad_norm": 4.807287216186523, + "learning_rate": 2.628232301822806e-08, + "loss": 0.8187, + "mean_token_accuracy": 0.7703363299369812, + "num_tokens": 2394772.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "grad_norm": 4.510971546173096, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7171, + "mean_token_accuracy": 0.801332414150238, + "num_tokens": 2433300.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "grad_norm": 4.558603286743164, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.8062, + "mean_token_accuracy": 
0.7737507820129395, + "num_tokens": 2475070.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "grad_norm": 4.607460975646973, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7765, + "mean_token_accuracy": 0.7837051153182983, + "num_tokens": 2513688.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "grad_norm": 4.708522796630859, + "learning_rate": 2.797795676133955e-08, + "loss": 0.7761, + "mean_token_accuracy": 0.7835496664047241, + "num_tokens": 2549423.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "grad_norm": 4.7121734619140625, + "learning_rate": 2.840186519711742e-08, + "loss": 0.8192, + "mean_token_accuracy": 0.7677602767944336, + "num_tokens": 2587865.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "grad_norm": 4.74582576751709, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.8056, + "mean_token_accuracy": 0.7729679346084595, + "num_tokens": 2623591.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "grad_norm": 4.538874626159668, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.7334, + "mean_token_accuracy": 0.7900732159614563, + "num_tokens": 2661403.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "grad_norm": 4.486273288726807, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.7947, + "mean_token_accuracy": 0.7748939990997314, + "num_tokens": 2704442.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "grad_norm": 4.818334579467773, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.8751, + "mean_token_accuracy": 0.7530067563056946, + "num_tokens": 2739423.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "grad_norm": 4.707066535949707, + "learning_rate": 3.052140737600678e-08, + "loss": 0.7925, + "mean_token_accuracy": 0.7799849510192871, + "num_tokens": 2775800.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "grad_norm": 4.154481887817383, + "learning_rate": 3.094531581178465e-08, + "loss": 0.7136, + "mean_token_accuracy": 
0.7989601492881775, + "num_tokens": 2817394.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "grad_norm": 4.713927268981934, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.8607, + "mean_token_accuracy": 0.7595996260643005, + "num_tokens": 2854140.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "grad_norm": 4.306637763977051, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7413, + "mean_token_accuracy": 0.788774847984314, + "num_tokens": 2894492.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "grad_norm": 4.795387268066406, + "learning_rate": 3.221704111911827e-08, + "loss": 0.8081, + "mean_token_accuracy": 0.7765604257583618, + "num_tokens": 2929726.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "grad_norm": 4.553252696990967, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8908, + "mean_token_accuracy": 0.7508599758148193, + "num_tokens": 2972039.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "grad_norm": 4.52592658996582, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7977, + "mean_token_accuracy": 0.7748531103134155, + "num_tokens": 3011924.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "grad_norm": 4.545559883117676, + "learning_rate": 3.348876642645188e-08, + "loss": 0.8062, + "mean_token_accuracy": 0.7760732173919678, + "num_tokens": 3049786.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "grad_norm": 4.3373589515686035, + "learning_rate": 3.391267486222975e-08, + "loss": 0.8161, + "mean_token_accuracy": 0.7713030576705933, + "num_tokens": 3092655.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "grad_norm": 4.566313743591309, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8412, + "mean_token_accuracy": 0.7655562162399292, + "num_tokens": 3130378.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "grad_norm": 4.160683631896973, + "learning_rate": 3.47604917337855e-08, + "loss": 0.7795, + "mean_token_accuracy": 
0.7784881591796875, + "num_tokens": 3177027.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "grad_norm": 4.602756500244141, + "learning_rate": 3.518440016956337e-08, + "loss": 0.8364, + "mean_token_accuracy": 0.7645465731620789, + "num_tokens": 3217500.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "grad_norm": 5.0481414794921875, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.7983, + "mean_token_accuracy": 0.7750927209854126, + "num_tokens": 3250642.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "grad_norm": 4.373542785644531, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.7854, + "mean_token_accuracy": 0.7844154834747314, + "num_tokens": 3288568.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "grad_norm": 4.638592720031738, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8065, + "mean_token_accuracy": 0.7733927965164185, + "num_tokens": 3327948.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "grad_norm": 4.595850467681885, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.7385, + "mean_token_accuracy": 0.7945128083229065, + "num_tokens": 3365544.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "grad_norm": 4.4719319343566895, + "learning_rate": 3.730394234845273e-08, + "loss": 0.8224, + "mean_token_accuracy": 0.7674025893211365, + "num_tokens": 3405278.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "grad_norm": 4.612032890319824, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8149, + "mean_token_accuracy": 0.7725558876991272, + "num_tokens": 3442356.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "grad_norm": 4.228574275970459, + "learning_rate": 3.815175922000847e-08, + "loss": 0.7859, + "mean_token_accuracy": 0.7792248725891113, + "num_tokens": 3487407.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "grad_norm": 4.882123947143555, + "learning_rate": 3.8575667655786345e-08, + "loss": 0.7938, + "mean_token_accuracy": 
0.7785038948059082, + "num_tokens": 3520838.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "grad_norm": 4.732731342315674, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7612, + "mean_token_accuracy": 0.7876899838447571, + "num_tokens": 3555300.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "grad_norm": 5.148312091827393, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8334, + "mean_token_accuracy": 0.7663031220436096, + "num_tokens": 3587307.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "grad_norm": 4.64989709854126, + "learning_rate": 3.984739296311997e-08, + "loss": 0.8109, + "mean_token_accuracy": 0.7723174691200256, + "num_tokens": 3623953.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "grad_norm": 4.734171390533447, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8033, + "mean_token_accuracy": 0.7759045362472534, + "num_tokens": 3658423.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "grad_norm": 4.498534202575684, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.7644, + "mean_token_accuracy": 0.7862260341644287, + "num_tokens": 3695704.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "grad_norm": 4.357119560241699, + "learning_rate": 4.111911827045358e-08, + "loss": 0.7866, + "mean_token_accuracy": 0.778313398361206, + "num_tokens": 3738881.0, + "step": 98 + }, + { + "epoch": 0.012593817580460501, + "grad_norm": 4.377371311187744, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7917, + "mean_token_accuracy": 0.7789276838302612, + "num_tokens": 3778465.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "grad_norm": 4.450660228729248, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.8056, + "mean_token_accuracy": 0.77694171667099, + "num_tokens": 3815904.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "grad_norm": 4.143291473388672, + "learning_rate": 4.23908435777872e-08, + "loss": 0.7309, + "mean_token_accuracy": 
0.794201135635376, + "num_tokens": 3856586.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "grad_norm": 4.34871768951416, + "learning_rate": 4.281475201356507e-08, + "loss": 0.8043, + "mean_token_accuracy": 0.7731078863143921, + "num_tokens": 3894979.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "grad_norm": 4.361048221588135, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.8091, + "mean_token_accuracy": 0.7744704484939575, + "num_tokens": 3934400.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "grad_norm": 3.995098114013672, + "learning_rate": 4.3662568885120816e-08, + "loss": 0.7665, + "mean_token_accuracy": 0.7788559794425964, + "num_tokens": 3977142.0, + "step": 104 + }, + { + "epoch": 0.013357079252003561, + "grad_norm": 4.124961853027344, + "learning_rate": 4.408647732089869e-08, + "loss": 0.7975, + "mean_token_accuracy": 0.776371419429779, + "num_tokens": 4017561.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "grad_norm": 4.226240158081055, + "learning_rate": 4.451038575667656e-08, + "loss": 0.8088, + "mean_token_accuracy": 0.7758969068527222, + "num_tokens": 4056797.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "grad_norm": 4.322922229766846, + "learning_rate": 4.493429419245443e-08, + "loss": 0.7579, + "mean_token_accuracy": 0.7892772555351257, + "num_tokens": 4092487.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "grad_norm": 4.267516136169434, + "learning_rate": 4.53582026282323e-08, + "loss": 0.7696, + "mean_token_accuracy": 0.7850325107574463, + "num_tokens": 4130772.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "grad_norm": 4.322446823120117, + "learning_rate": 4.578211106401017e-08, + "loss": 0.8079, + "mean_token_accuracy": 0.768169641494751, + "num_tokens": 4167818.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "grad_norm": 4.275991439819336, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.7432, + "mean_token_accuracy": 
0.7865269780158997, + "num_tokens": 4204814.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + "grad_norm": 4.289038181304932, + "learning_rate": 4.662992793556592e-08, + "loss": 0.7694, + "mean_token_accuracy": 0.774307906627655, + "num_tokens": 4241461.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "grad_norm": 3.941042900085449, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7684, + "mean_token_accuracy": 0.7810178995132446, + "num_tokens": 4284803.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "grad_norm": 4.449498653411865, + "learning_rate": 4.747774480712166e-08, + "loss": 0.8114, + "mean_token_accuracy": 0.7701939940452576, + "num_tokens": 4323921.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "grad_norm": 4.2469258308410645, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7287, + "mean_token_accuracy": 0.7906850576400757, + "num_tokens": 4363137.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "grad_norm": 4.352765083312988, + "learning_rate": 4.832556167867741e-08, + "loss": 0.7848, + "mean_token_accuracy": 0.7760191559791565, + "num_tokens": 4400586.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "grad_norm": 4.410439491271973, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.7921, + "mean_token_accuracy": 0.7778503894805908, + "num_tokens": 4436747.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "grad_norm": 4.273299694061279, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7886, + "mean_token_accuracy": 0.7825117111206055, + "num_tokens": 4473810.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "grad_norm": 4.097933769226074, + "learning_rate": 4.959728698601102e-08, + "loss": 0.6844, + "mean_token_accuracy": 0.8032032251358032, + "num_tokens": 4510078.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "grad_norm": 4.386758804321289, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7744, + 
"mean_token_accuracy": 0.7823208570480347, + "num_tokens": 4545297.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "grad_norm": 4.169480800628662, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7965, + "mean_token_accuracy": 0.7747797966003418, + "num_tokens": 4584245.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "grad_norm": 4.3669562339782715, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.7631, + "mean_token_accuracy": 0.7797352075576782, + "num_tokens": 4618901.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "grad_norm": 4.393669605255127, + "learning_rate": 5.129292072912251e-08, + "loss": 0.853, + "mean_token_accuracy": 0.7574356198310852, + "num_tokens": 4656399.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "grad_norm": 4.217415809631348, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7749, + "mean_token_accuracy": 0.7829242944717407, + "num_tokens": 4692918.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "grad_norm": 4.362590312957764, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.745, + "mean_token_accuracy": 0.7849986553192139, + "num_tokens": 4728938.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "grad_norm": 4.25604772567749, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7902, + "mean_token_accuracy": 0.7774913311004639, + "num_tokens": 4767227.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "grad_norm": 4.492059707641602, + "learning_rate": 5.298855447223399e-08, + "loss": 0.7602, + "mean_token_accuracy": 0.7863538861274719, + "num_tokens": 4800458.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "grad_norm": 4.036756992340088, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7693, + "mean_token_accuracy": 0.7795374393463135, + "num_tokens": 4840976.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "grad_norm": 4.528698444366455, + "learning_rate": 5.383637134378974e-08, + "loss": 0.7919, + 
"mean_token_accuracy": 0.7758148908615112, + "num_tokens": 4877132.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "grad_norm": 4.373518466949463, + "learning_rate": 5.426027977956761e-08, + "loss": 0.7624, + "mean_token_accuracy": 0.78534334897995, + "num_tokens": 4914477.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "grad_norm": 4.683117389678955, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7153, + "mean_token_accuracy": 0.7936409115791321, + "num_tokens": 4946850.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "grad_norm": 4.095667839050293, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.7376, + "mean_token_accuracy": 0.7913708686828613, + "num_tokens": 4986922.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "grad_norm": 4.097377300262451, + "learning_rate": 5.553200508690123e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7804635763168335, + "num_tokens": 5027951.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "grad_norm": 3.697374105453491, + "learning_rate": 5.59559135226791e-08, + "loss": 0.6755, + "mean_token_accuracy": 0.8057949542999268, + "num_tokens": 5069167.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "grad_norm": 4.405448913574219, + "learning_rate": 5.637982195845697e-08, + "loss": 0.7655, + "mean_token_accuracy": 0.7823858261108398, + "num_tokens": 5104247.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "grad_norm": 4.707314491271973, + "learning_rate": 5.680373039423484e-08, + "loss": 0.788, + "mean_token_accuracy": 0.7790038585662842, + "num_tokens": 5138563.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "grad_norm": 4.155884742736816, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7776, + "mean_token_accuracy": 0.7755191922187805, + "num_tokens": 5175329.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "grad_norm": 4.054533004760742, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.821, + 
"mean_token_accuracy": 0.7646670341491699, + "num_tokens": 5215965.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "grad_norm": 4.436336040496826, + "learning_rate": 5.807545570156846e-08, + "loss": 0.7549, + "mean_token_accuracy": 0.7862895727157593, + "num_tokens": 5249119.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "grad_norm": 4.3403425216674805, + "learning_rate": 5.849936413734633e-08, + "loss": 0.7834, + "mean_token_accuracy": 0.7730858325958252, + "num_tokens": 5282719.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "grad_norm": 4.160698890686035, + "learning_rate": 5.89232725731242e-08, + "loss": 0.7858, + "mean_token_accuracy": 0.7759907245635986, + "num_tokens": 5322764.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "grad_norm": 4.521846294403076, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7698, + "mean_token_accuracy": 0.7765457630157471, + "num_tokens": 5351964.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "grad_norm": 3.5716850757598877, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7249, + "mean_token_accuracy": 0.791903018951416, + "num_tokens": 5397258.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "grad_norm": 4.073146343231201, + "learning_rate": 6.019499788045781e-08, + "loss": 0.8423, + "mean_token_accuracy": 0.7573860883712769, + "num_tokens": 5434262.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "grad_norm": 3.9093120098114014, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7269, + "mean_token_accuracy": 0.7886580228805542, + "num_tokens": 5473274.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "grad_norm": 3.6105480194091797, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7381, + "mean_token_accuracy": 0.7825309038162231, + "num_tokens": 5510394.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "grad_norm": 3.9877655506134033, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7206, 
+ "mean_token_accuracy": 0.7922298908233643, + "num_tokens": 5546570.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "grad_norm": 3.7869184017181396, + "learning_rate": 6.18906316235693e-08, + "loss": 0.7486, + "mean_token_accuracy": 0.7825199365615845, + "num_tokens": 5583687.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "grad_norm": 3.7182555198669434, + "learning_rate": 6.231454005934718e-08, + "loss": 0.7315, + "mean_token_accuracy": 0.7868874669075012, + "num_tokens": 5619290.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "grad_norm": 3.866356372833252, + "learning_rate": 6.273844849512505e-08, + "loss": 0.8594, + "mean_token_accuracy": 0.7556565403938293, + "num_tokens": 5655751.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "grad_norm": 3.148918867111206, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7018, + "mean_token_accuracy": 0.7975557446479797, + "num_tokens": 5701498.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "grad_norm": 3.784848690032959, + "learning_rate": 6.35862653666808e-08, + "loss": 0.7779, + "mean_token_accuracy": 0.7782930731773376, + "num_tokens": 5735602.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "grad_norm": 3.0604395866394043, + "learning_rate": 6.401017380245867e-08, + "loss": 0.6558, + "mean_token_accuracy": 0.8060742616653442, + "num_tokens": 5777558.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "grad_norm": 3.698992967605591, + "learning_rate": 6.443408223823654e-08, + "loss": 0.6932, + "mean_token_accuracy": 0.7976251840591431, + "num_tokens": 5809627.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "grad_norm": 3.493917465209961, + "learning_rate": 6.485799067401441e-08, + "loss": 0.7485, + "mean_token_accuracy": 0.7832435965538025, + "num_tokens": 5847346.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "grad_norm": 4.023364543914795, + "learning_rate": 6.528189910979228e-08, + "loss": 0.7879, + 
"mean_token_accuracy": 0.7729952335357666, + "num_tokens": 5880631.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "grad_norm": 3.506523609161377, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7487, + "mean_token_accuracy": 0.7844482660293579, + "num_tokens": 5919482.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "grad_norm": 3.4090123176574707, + "learning_rate": 6.612971598134802e-08, + "loss": 0.7237, + "mean_token_accuracy": 0.7894665002822876, + "num_tokens": 5957126.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "grad_norm": 3.3531415462493896, + "learning_rate": 6.655362441712589e-08, + "loss": 0.6994, + "mean_token_accuracy": 0.794451117515564, + "num_tokens": 5994237.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "grad_norm": 3.581841468811035, + "learning_rate": 6.697753285290376e-08, + "loss": 0.7239, + "mean_token_accuracy": 0.7862144112586975, + "num_tokens": 6029093.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "grad_norm": 3.3962488174438477, + "learning_rate": 6.740144128868163e-08, + "loss": 0.6742, + "mean_token_accuracy": 0.8028716444969177, + "num_tokens": 6066185.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "grad_norm": 3.120938777923584, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7259, + "mean_token_accuracy": 0.7903046607971191, + "num_tokens": 6107737.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "grad_norm": 3.5757761001586914, + "learning_rate": 6.824925816023738e-08, + "loss": 0.7118, + "mean_token_accuracy": 0.7846766710281372, + "num_tokens": 6140277.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "grad_norm": 3.3938517570495605, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7494, + "mean_token_accuracy": 0.7807459831237793, + "num_tokens": 6181627.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "grad_norm": 3.300715446472168, + "learning_rate": 6.909707503179312e-08, + "loss": 0.7451, + 
"mean_token_accuracy": 0.7859368920326233, + "num_tokens": 6222406.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "grad_norm": 3.1637165546417236, + "learning_rate": 6.9520983467571e-08, + "loss": 0.7323, + "mean_token_accuracy": 0.7860749959945679, + "num_tokens": 6261796.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "grad_norm": 3.6255595684051514, + "learning_rate": 6.994489190334887e-08, + "loss": 0.7255, + "mean_token_accuracy": 0.7870409488677979, + "num_tokens": 6296514.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "grad_norm": 3.0641448497772217, + "learning_rate": 7.036880033912674e-08, + "loss": 0.7401, + "mean_token_accuracy": 0.7840508222579956, + "num_tokens": 6340455.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "grad_norm": 3.347264289855957, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7377, + "mean_token_accuracy": 0.7851111888885498, + "num_tokens": 6379707.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "grad_norm": 3.10895037651062, + "learning_rate": 7.121661721068249e-08, + "loss": 0.7115, + "mean_token_accuracy": 0.7948207259178162, + "num_tokens": 6420225.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "grad_norm": 3.756281614303589, + "learning_rate": 7.164052564646036e-08, + "loss": 0.6959, + "mean_token_accuracy": 0.7936943769454956, + "num_tokens": 6451401.0, + "step": 170 + }, + { + "epoch": 0.02175295763897723, + "grad_norm": 3.494393825531006, + "learning_rate": 7.206443408223823e-08, + "loss": 0.8588, + "mean_token_accuracy": 0.7479308843612671, + "num_tokens": 6493360.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "grad_norm": 3.364917516708374, + "learning_rate": 7.24883425180161e-08, + "loss": 0.7473, + "mean_token_accuracy": 0.7842448949813843, + "num_tokens": 6529103.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "grad_norm": 3.6001944541931152, + "learning_rate": 7.291225095379398e-08, + "loss": 0.7668, + 
"mean_token_accuracy": 0.7786641716957092, + "num_tokens": 6565749.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "grad_norm": 2.9079387187957764, + "learning_rate": 7.333615938957185e-08, + "loss": 0.7182, + "mean_token_accuracy": 0.7866941094398499, + "num_tokens": 6609991.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "grad_norm": 3.459444284439087, + "learning_rate": 7.376006782534971e-08, + "loss": 0.7142, + "mean_token_accuracy": 0.7897404432296753, + "num_tokens": 6641926.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "grad_norm": 3.785703659057617, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7143, + "mean_token_accuracy": 0.7909746170043945, + "num_tokens": 6675761.0, + "step": 176 + }, + { + "epoch": 0.02251621931052029, + "grad_norm": 3.311720371246338, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7564, + "mean_token_accuracy": 0.7765612602233887, + "num_tokens": 6715012.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "grad_norm": 3.365351915359497, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7081, + "mean_token_accuracy": 0.7909303903579712, + "num_tokens": 6754675.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "grad_norm": 3.2877259254455566, + "learning_rate": 7.54557015684612e-08, + "loss": 0.6471, + "mean_token_accuracy": 0.810468316078186, + "num_tokens": 6793221.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "grad_norm": 3.258385419845581, + "learning_rate": 7.587961000423907e-08, + "loss": 0.7581, + "mean_token_accuracy": 0.7767417430877686, + "num_tokens": 6831051.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "grad_norm": 3.4091851711273193, + "learning_rate": 7.630351844001694e-08, + "loss": 0.8028, + "mean_token_accuracy": 0.7604841589927673, + "num_tokens": 6867866.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "grad_norm": 3.221931219100952, + "learning_rate": 7.672742687579482e-08, + "loss": 0.7468, + 
"mean_token_accuracy": 0.7812994718551636, + "num_tokens": 6908601.0, + "step": 182 + }, + { + "epoch": 0.02327948098206335, + "grad_norm": 3.536679983139038, + "learning_rate": 7.715133531157269e-08, + "loss": 0.7048, + "mean_token_accuracy": 0.7968044877052307, + "num_tokens": 6942088.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "grad_norm": 3.459209442138672, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7084, + "mean_token_accuracy": 0.7908079028129578, + "num_tokens": 6978999.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "grad_norm": 3.9978010654449463, + "learning_rate": 7.799915218312844e-08, + "loss": 0.7087, + "mean_token_accuracy": 0.7897510528564453, + "num_tokens": 7010909.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "grad_norm": 3.2191410064697266, + "learning_rate": 7.842306061890631e-08, + "loss": 0.7272, + "mean_token_accuracy": 0.787298321723938, + "num_tokens": 7049066.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "grad_norm": 4.111948013305664, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7041, + "mean_token_accuracy": 0.7936994433403015, + "num_tokens": 7078335.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "grad_norm": 3.3511312007904053, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7257, + "mean_token_accuracy": 0.7889295816421509, + "num_tokens": 7117165.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + "grad_norm": 3.423619031906128, + "learning_rate": 7.969478592623994e-08, + "loss": 0.699, + "mean_token_accuracy": 0.7941707372665405, + "num_tokens": 7158409.0, + "step": 189 + }, + { + "epoch": 0.024169952932196922, + "grad_norm": 3.9014368057250977, + "learning_rate": 8.011869436201781e-08, + "loss": 0.6802, + "mean_token_accuracy": 0.7979208827018738, + "num_tokens": 7194317.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "grad_norm": 3.041771173477173, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7243, + 
"mean_token_accuracy": 0.7850596308708191, + "num_tokens": 7236014.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "grad_norm": 3.0736849308013916, + "learning_rate": 8.096651123357356e-08, + "loss": 0.6832, + "mean_token_accuracy": 0.7985824942588806, + "num_tokens": 7272399.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "grad_norm": 3.607764720916748, + "learning_rate": 8.139041966935143e-08, + "loss": 0.6914, + "mean_token_accuracy": 0.7941519021987915, + "num_tokens": 7312818.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "grad_norm": 3.452845573425293, + "learning_rate": 8.181432810512929e-08, + "loss": 0.7091, + "mean_token_accuracy": 0.7906390428543091, + "num_tokens": 7349554.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "grad_norm": 4.02845573425293, + "learning_rate": 8.223823654090716e-08, + "loss": 0.737, + "mean_token_accuracy": 0.7812983989715576, + "num_tokens": 7386803.0, + "step": 195 + }, + { + "epoch": 0.024933214603739984, + "grad_norm": 2.9343924522399902, + "learning_rate": 8.266214497668503e-08, + "loss": 0.6936, + "mean_token_accuracy": 0.7985713481903076, + "num_tokens": 7432539.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "grad_norm": 3.4381721019744873, + "learning_rate": 8.30860534124629e-08, + "loss": 0.6777, + "mean_token_accuracy": 0.7969809770584106, + "num_tokens": 7470720.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "grad_norm": 3.761683702468872, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7018, + "mean_token_accuracy": 0.7907449007034302, + "num_tokens": 7503988.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "grad_norm": 3.623110294342041, + "learning_rate": 8.393387028401865e-08, + "loss": 0.6551, + "mean_token_accuracy": 0.8033718466758728, + "num_tokens": 7540871.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "grad_norm": 3.955375909805298, + "learning_rate": 8.435777871979652e-08, + "loss": 0.7243, + 
"mean_token_accuracy": 0.786307692527771, + "num_tokens": 7580269.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "grad_norm": 4.12459135055542, + "learning_rate": 8.47816871555744e-08, + "loss": 0.7308, + "mean_token_accuracy": 0.7823891639709473, + "num_tokens": 7617089.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "grad_norm": 3.1486270427703857, + "learning_rate": 8.520559559135227e-08, + "loss": 0.6596, + "mean_token_accuracy": 0.8062372207641602, + "num_tokens": 7656884.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "grad_norm": 3.9036388397216797, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7032, + "mean_token_accuracy": 0.7919281721115112, + "num_tokens": 7696604.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "grad_norm": 3.995670795440674, + "learning_rate": 8.605341246290801e-08, + "loss": 0.7064, + "mean_token_accuracy": 0.7880663871765137, + "num_tokens": 7731357.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "grad_norm": 3.2689573764801025, + "learning_rate": 8.647732089868589e-08, + "loss": 0.7124, + "mean_token_accuracy": 0.7874469757080078, + "num_tokens": 7765972.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "grad_norm": 2.7404749393463135, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7137, + "mean_token_accuracy": 0.7937867641448975, + "num_tokens": 7806959.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "grad_norm": 4.092940330505371, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6845, + "mean_token_accuracy": 0.7991146445274353, + "num_tokens": 7842465.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "grad_norm": 3.0634765625, + "learning_rate": 8.77490462060195e-08, + "loss": 0.716, + "mean_token_accuracy": 0.7892230749130249, + "num_tokens": 7881583.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "grad_norm": 3.905644416809082, + "learning_rate": 8.817295464179738e-08, + "loss": 0.6401, + 
"mean_token_accuracy": 0.8087812066078186, + "num_tokens": 7923152.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "grad_norm": 3.7504637241363525, + "learning_rate": 8.859686307757525e-08, + "loss": 0.7546, + "mean_token_accuracy": 0.775786280632019, + "num_tokens": 7962466.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "grad_norm": 3.329334259033203, + "learning_rate": 8.902077151335312e-08, + "loss": 0.7139, + "mean_token_accuracy": 0.7881566286087036, + "num_tokens": 7999794.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "grad_norm": 2.8758344650268555, + "learning_rate": 8.944467994913098e-08, + "loss": 0.6232, + "mean_token_accuracy": 0.8119540214538574, + "num_tokens": 8036568.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "grad_norm": 3.4155962467193604, + "learning_rate": 8.986858838490885e-08, + "loss": 0.6681, + "mean_token_accuracy": 0.7936084270477295, + "num_tokens": 8073746.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "grad_norm": 5.109255790710449, + "learning_rate": 9.029249682068673e-08, + "loss": 0.653, + "mean_token_accuracy": 0.8023583889007568, + "num_tokens": 8109788.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "grad_norm": 4.730036735534668, + "learning_rate": 9.07164052564646e-08, + "loss": 0.6357, + "mean_token_accuracy": 0.8089730739593506, + "num_tokens": 8144547.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "grad_norm": 3.7595882415771484, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6414, + "mean_token_accuracy": 0.8110851049423218, + "num_tokens": 8193334.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "grad_norm": 3.204392433166504, + "learning_rate": 9.156422212802034e-08, + "loss": 0.7091, + "mean_token_accuracy": 0.786286473274231, + "num_tokens": 8232269.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "grad_norm": 4.308266639709473, + "learning_rate": 9.198813056379822e-08, + "loss": 0.7633, + 
"mean_token_accuracy": 0.7803970575332642, + "num_tokens": 8268124.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "grad_norm": 3.6024556159973145, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6637, + "mean_token_accuracy": 0.8004720211029053, + "num_tokens": 8305704.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "grad_norm": 3.5572550296783447, + "learning_rate": 9.283594743535396e-08, + "loss": 0.7456, + "mean_token_accuracy": 0.7746098041534424, + "num_tokens": 8344761.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "grad_norm": 2.9198944568634033, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6208, + "mean_token_accuracy": 0.817308783531189, + "num_tokens": 8383201.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "grad_norm": 5.155540466308594, + "learning_rate": 9.368376430690971e-08, + "loss": 0.6747, + "mean_token_accuracy": 0.7969631552696228, + "num_tokens": 8418688.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "grad_norm": 4.769636631011963, + "learning_rate": 9.410767274268758e-08, + "loss": 0.596, + "mean_token_accuracy": 0.8222938776016235, + "num_tokens": 8455786.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "grad_norm": 4.5710625648498535, + "learning_rate": 9.453158117846545e-08, + "loss": 0.6929, + "mean_token_accuracy": 0.7932450175285339, + "num_tokens": 8488007.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "grad_norm": 3.4059808254241943, + "learning_rate": 9.495548961424333e-08, + "loss": 0.657, + "mean_token_accuracy": 0.8025411367416382, + "num_tokens": 8529586.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "grad_norm": 4.061327934265137, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6826, + "mean_token_accuracy": 0.794485330581665, + "num_tokens": 8564912.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "grad_norm": 4.632757663726807, + "learning_rate": 9.580330648579907e-08, + "loss": 0.6315, + 
"mean_token_accuracy": 0.8087653517723083, + "num_tokens": 8604268.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "grad_norm": 4.7451982498168945, + "learning_rate": 9.622721492157694e-08, + "loss": 0.6709, + "mean_token_accuracy": 0.7955394387245178, + "num_tokens": 8643935.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "grad_norm": 3.498277425765991, + "learning_rate": 9.665112335735482e-08, + "loss": 0.7448, + "mean_token_accuracy": 0.7786338925361633, + "num_tokens": 8677348.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "grad_norm": 3.580575942993164, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6606, + "mean_token_accuracy": 0.7990199327468872, + "num_tokens": 8718747.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "grad_norm": 3.55259108543396, + "learning_rate": 9.749894022891055e-08, + "loss": 0.6498, + "mean_token_accuracy": 0.7997193932533264, + "num_tokens": 8756317.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "grad_norm": 3.9624712467193604, + "learning_rate": 9.792284866468842e-08, + "loss": 0.6645, + "mean_token_accuracy": 0.8007245063781738, + "num_tokens": 8798903.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "grad_norm": 3.542154312133789, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6636, + "mean_token_accuracy": 0.7988182902336121, + "num_tokens": 8839766.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "grad_norm": 3.6927640438079834, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6975, + "mean_token_accuracy": 0.7885463237762451, + "num_tokens": 8878542.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "grad_norm": 3.5314691066741943, + "learning_rate": 9.919457397202204e-08, + "loss": 0.6194, + "mean_token_accuracy": 0.8113341927528381, + "num_tokens": 8913426.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "grad_norm": 4.622177600860596, + "learning_rate": 9.961848240779991e-08, + "loss": 0.6539, 
+ "mean_token_accuracy": 0.8024784922599792, + "num_tokens": 8953593.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "grad_norm": 4.145118713378906, + "learning_rate": 1.0004239084357778e-07, + "loss": 0.6836, + "mean_token_accuracy": 0.7959121465682983, + "num_tokens": 8994076.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "grad_norm": 3.732285261154175, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6096, + "mean_token_accuracy": 0.8150599002838135, + "num_tokens": 9035109.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "grad_norm": 4.198469161987305, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.6363, + "mean_token_accuracy": 0.809563934803009, + "num_tokens": 9073690.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "grad_norm": 3.417163610458374, + "learning_rate": 1.013141161509114e-07, + "loss": 0.599, + "mean_token_accuracy": 0.8180171251296997, + "num_tokens": 9114416.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "grad_norm": 3.580732583999634, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.5903, + "mean_token_accuracy": 0.8213066458702087, + "num_tokens": 9150254.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "grad_norm": 5.374983310699463, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.6583, + "mean_token_accuracy": 0.7992425560951233, + "num_tokens": 9186122.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "grad_norm": 5.032679080963135, + "learning_rate": 1.0258584145824502e-07, + "loss": 0.7201, + "mean_token_accuracy": 0.7877621650695801, + "num_tokens": 9224080.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "grad_norm": 3.950942277908325, + "learning_rate": 1.0300974989402289e-07, + "loss": 0.6434, + "mean_token_accuracy": 0.8095223903656006, + "num_tokens": 9268908.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "grad_norm": 4.672729969024658, + "learning_rate": 1.0343365832980076e-07, + "loss": 
0.6261, + "mean_token_accuracy": 0.8095069527626038, + "num_tokens": 9307461.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "grad_norm": 3.6456544399261475, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.6977, + "mean_token_accuracy": 0.7876970171928406, + "num_tokens": 9347875.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "grad_norm": 4.373722076416016, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.686, + "mean_token_accuracy": 0.791498064994812, + "num_tokens": 9386288.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "grad_norm": 4.167595386505127, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.6463, + "mean_token_accuracy": 0.8041632175445557, + "num_tokens": 9420281.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "grad_norm": 4.30181360244751, + "learning_rate": 1.0512929207291224e-07, + "loss": 0.6305, + "mean_token_accuracy": 0.8083238005638123, + "num_tokens": 9460932.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "grad_norm": 4.028106212615967, + "learning_rate": 1.0555320050869011e-07, + "loss": 0.6061, + "mean_token_accuracy": 0.816354513168335, + "num_tokens": 9499415.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "grad_norm": 5.639784812927246, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.645, + "mean_token_accuracy": 0.8040623664855957, + "num_tokens": 9538314.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "grad_norm": 6.038763046264648, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.7253, + "mean_token_accuracy": 0.7808339595794678, + "num_tokens": 9573643.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "grad_norm": 5.391833782196045, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.672, + "mean_token_accuracy": 0.8016908168792725, + "num_tokens": 9618799.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "grad_norm": 4.231356143951416, + "learning_rate": 1.072488342518016e-07, + "loss": 
0.623, + "mean_token_accuracy": 0.81174236536026, + "num_tokens": 9658460.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "grad_norm": 4.998953342437744, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.7198, + "mean_token_accuracy": 0.784131646156311, + "num_tokens": 9698015.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "grad_norm": 3.7710821628570557, + "learning_rate": 1.0809665112335735e-07, + "loss": 0.6506, + "mean_token_accuracy": 0.8052817583084106, + "num_tokens": 9736284.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "grad_norm": 5.407049655914307, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.7061, + "mean_token_accuracy": 0.7891525626182556, + "num_tokens": 9780131.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "grad_norm": 3.9581258296966553, + "learning_rate": 1.089444679949131e-07, + "loss": 0.6899, + "mean_token_accuracy": 0.790682315826416, + "num_tokens": 9819420.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "grad_norm": 5.3281097412109375, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6139, + "mean_token_accuracy": 0.8126059174537659, + "num_tokens": 9858002.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "grad_norm": 5.772911071777344, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.7048, + "mean_token_accuracy": 0.7873782515525818, + "num_tokens": 9894120.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "grad_norm": 5.435359477996826, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7697, + "mean_token_accuracy": 0.7667263746261597, + "num_tokens": 9924849.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "grad_norm": 7.158500671386719, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.6323, + "mean_token_accuracy": 0.8075273633003235, + "num_tokens": 9960751.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "grad_norm": 4.339040279388428, + "learning_rate": 1.1106401017380246e-07, + "loss": 
0.6545, + "mean_token_accuracy": 0.8015820980072021, + "num_tokens": 9997988.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "grad_norm": 3.748641014099121, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.6026, + "mean_token_accuracy": 0.8164246082305908, + "num_tokens": 10029286.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "grad_norm": 4.060466289520264, + "learning_rate": 1.119118270453582e-07, + "loss": 0.627, + "mean_token_accuracy": 0.8098658323287964, + "num_tokens": 10068216.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "grad_norm": 4.45255184173584, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.5462, + "mean_token_accuracy": 0.8324424028396606, + "num_tokens": 10106729.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "grad_norm": 4.932199954986572, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6865, + "mean_token_accuracy": 0.7940630912780762, + "num_tokens": 10144554.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "grad_norm": 4.183717727661133, + "learning_rate": 1.131835523526918e-07, + "loss": 0.6487, + "mean_token_accuracy": 0.8007873296737671, + "num_tokens": 10181300.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "grad_norm": 4.277474880218506, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.6175, + "mean_token_accuracy": 0.8146201372146606, + "num_tokens": 10218210.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "grad_norm": 4.094005584716797, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.7014, + "mean_token_accuracy": 0.7841821908950806, + "num_tokens": 10254876.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "grad_norm": 3.945847511291504, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6626, + "mean_token_accuracy": 0.8006522059440613, + "num_tokens": 10296679.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "grad_norm": 4.4878973960876465, + "learning_rate": 1.148791860958033e-07, + 
"loss": 0.7039, + "mean_token_accuracy": 0.7868071794509888, + "num_tokens": 10333286.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "grad_norm": 7.2124342918396, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.6986, + "mean_token_accuracy": 0.7870566844940186, + "num_tokens": 10362425.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "grad_norm": 5.665987968444824, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.6939, + "mean_token_accuracy": 0.7897220849990845, + "num_tokens": 10401629.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "grad_norm": 3.1642048358917236, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.6588, + "mean_token_accuracy": 0.8037142753601074, + "num_tokens": 10439333.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "grad_norm": 5.4103193283081055, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.6278, + "mean_token_accuracy": 0.8119738101959229, + "num_tokens": 10476170.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "grad_norm": 4.8055033683776855, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6438, + "mean_token_accuracy": 0.8023854494094849, + "num_tokens": 10513298.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "grad_norm": 4.231077194213867, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.6724, + "mean_token_accuracy": 0.797193706035614, + "num_tokens": 10551149.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "grad_norm": 5.453915119171143, + "learning_rate": 1.178465451462484e-07, + "loss": 0.6137, + "mean_token_accuracy": 0.812427818775177, + "num_tokens": 10590034.0, + "step": 279 + }, + { + "epoch": 0.03561887800534283, + "grad_norm": 4.5331339836120605, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.5773, + "mean_token_accuracy": 0.821941614151001, + "num_tokens": 10626670.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "grad_norm": 4.380234241485596, + "learning_rate": 
1.1869436201780415e-07, + "loss": 0.6432, + "mean_token_accuracy": 0.7999981641769409, + "num_tokens": 10661319.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "grad_norm": 4.138274669647217, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.6692, + "mean_token_accuracy": 0.8055537939071655, + "num_tokens": 10700705.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "grad_norm": 4.401586055755615, + "learning_rate": 1.195421788893599e-07, + "loss": 0.6489, + "mean_token_accuracy": 0.8021796941757202, + "num_tokens": 10734985.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "grad_norm": 3.9735610485076904, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6273, + "mean_token_accuracy": 0.808967649936676, + "num_tokens": 10775868.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "grad_norm": 4.904382705688477, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.6003, + "mean_token_accuracy": 0.8175035715103149, + "num_tokens": 10815638.0, + "step": 285 + }, + { + "epoch": 0.036382139676885895, + "grad_norm": 5.48204231262207, + "learning_rate": 1.208139041966935e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.8122063875198364, + "num_tokens": 10853402.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "grad_norm": 5.209934711456299, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.6529, + "mean_token_accuracy": 0.8025312423706055, + "num_tokens": 10887031.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "grad_norm": 5.906895160675049, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.6456, + "mean_token_accuracy": 0.8023965358734131, + "num_tokens": 10924568.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "grad_norm": 5.628559589385986, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.6866, + "mean_token_accuracy": 0.7932870984077454, + "num_tokens": 10959794.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "grad_norm": 5.278966903686523, + 
"learning_rate": 1.22509537939805e-07, + "loss": 0.7082, + "mean_token_accuracy": 0.786495566368103, + "num_tokens": 10998746.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "grad_norm": 4.715980052947998, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6091, + "mean_token_accuracy": 0.8119910359382629, + "num_tokens": 11038232.0, + "step": 291 + }, + { + "epoch": 0.037145401348428954, + "grad_norm": 5.2078094482421875, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6439, + "mean_token_accuracy": 0.8041412830352783, + "num_tokens": 11075448.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "grad_norm": 4.498593330383301, + "learning_rate": 1.237812632471386e-07, + "loss": 0.6424, + "mean_token_accuracy": 0.8072681427001953, + "num_tokens": 11118255.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "grad_norm": 4.349668502807617, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.5963, + "mean_token_accuracy": 0.8174676895141602, + "num_tokens": 11157661.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "grad_norm": 4.797399997711182, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6513, + "mean_token_accuracy": 0.8004592657089233, + "num_tokens": 11196868.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "grad_norm": 4.453806400299072, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.5805, + "mean_token_accuracy": 0.8219411373138428, + "num_tokens": 11235143.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "grad_norm": 4.523251056671143, + "learning_rate": 1.254768969902501e-07, + "loss": 0.6681, + "mean_token_accuracy": 0.7951433062553406, + "num_tokens": 11280292.0, + "step": 297 + }, + { + "epoch": 0.03790866301997201, + "grad_norm": 3.3392159938812256, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.618, + "mean_token_accuracy": 0.8134062886238098, + "num_tokens": 11323359.0, + "step": 298 + }, + { + "epoch": 0.03803587329856253, + "grad_norm": 
4.961188793182373, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6177, + "mean_token_accuracy": 0.8123466372489929, + "num_tokens": 11362142.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "grad_norm": 6.6218719482421875, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6306, + "mean_token_accuracy": 0.8054583072662354, + "num_tokens": 11395754.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "grad_norm": 6.031904697418213, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6141, + "mean_token_accuracy": 0.8146783709526062, + "num_tokens": 11433947.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "grad_norm": 5.314549922943115, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6739, + "mean_token_accuracy": 0.7909435033798218, + "num_tokens": 11471300.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "grad_norm": 4.438421249389648, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.6204, + "mean_token_accuracy": 0.8090065121650696, + "num_tokens": 11509259.0, + "step": 303 + }, + { + "epoch": 0.03867192469151508, + "grad_norm": 5.380069255828857, + "learning_rate": 1.284442560406952e-07, + "loss": 0.6346, + "mean_token_accuracy": 0.8054120540618896, + "num_tokens": 11542499.0, + "step": 304 + }, + { + "epoch": 0.038799134970105585, + "grad_norm": 5.713099479675293, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.6537, + "mean_token_accuracy": 0.7988308668136597, + "num_tokens": 11583260.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "grad_norm": 3.899196147918701, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6491, + "mean_token_accuracy": 0.7998721599578857, + "num_tokens": 11617778.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "grad_norm": 5.335933685302734, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6896, + "mean_token_accuracy": 0.789283275604248, + "num_tokens": 11650314.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + 
"grad_norm": 4.612327575683594, + "learning_rate": 1.301398897838067e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8245725631713867, + "num_tokens": 11687352.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "grad_norm": 5.477524757385254, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.6495, + "mean_token_accuracy": 0.7999975681304932, + "num_tokens": 11724799.0, + "step": 309 + }, + { + "epoch": 0.039435186363058136, + "grad_norm": 3.8578267097473145, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.6467, + "mean_token_accuracy": 0.8038100004196167, + "num_tokens": 11769211.0, + "step": 310 + }, + { + "epoch": 0.039562396641648644, + "grad_norm": 5.463492393493652, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6745, + "mean_token_accuracy": 0.7911505699157715, + "num_tokens": 11801881.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "grad_norm": 4.346237659454346, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6146, + "mean_token_accuracy": 0.8115617036819458, + "num_tokens": 11838640.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "grad_norm": 4.673257350921631, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6957, + "mean_token_accuracy": 0.7880085706710815, + "num_tokens": 11874443.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "grad_norm": 3.745023012161255, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6011, + "mean_token_accuracy": 0.8160321712493896, + "num_tokens": 11915852.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "grad_norm": 3.6101369857788086, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.6007, + "mean_token_accuracy": 0.8158063888549805, + "num_tokens": 11952224.0, + "step": 315 + }, + { + "epoch": 0.040198448034601195, + "grad_norm": 5.435547351837158, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6568, + "mean_token_accuracy": 0.7991122007369995, + "num_tokens": 11984589.0, + "step": 316 + }, + { + "epoch": 
0.04032565831319171, + "grad_norm": 4.07612419128418, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8181610107421875, + "num_tokens": 12019568.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "grad_norm": 5.009436130523682, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.6276, + "mean_token_accuracy": 0.8052650094032288, + "num_tokens": 12050095.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "grad_norm": 4.4030632972717285, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.6611, + "mean_token_accuracy": 0.8005396127700806, + "num_tokens": 12086243.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "grad_norm": 3.823237657546997, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.5417, + "mean_token_accuracy": 0.8322736024856567, + "num_tokens": 12121597.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "grad_norm": 4.417186260223389, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6252, + "mean_token_accuracy": 0.807835578918457, + "num_tokens": 12154714.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + "grad_norm": 5.166866302490234, + "learning_rate": 1.360746078846969e-07, + "loss": 0.6026, + "mean_token_accuracy": 0.8123235106468201, + "num_tokens": 12189725.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "grad_norm": 5.816406726837158, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6355, + "mean_token_accuracy": 0.8041502237319946, + "num_tokens": 12229328.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "grad_norm": 5.203060150146484, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.6581, + "mean_token_accuracy": 0.8038981556892395, + "num_tokens": 12267825.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "grad_norm": 3.8511483669281006, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6419, + "mean_token_accuracy": 0.8053834438323975, + "num_tokens": 12305285.0, + "step": 325 + }, + { + 
"epoch": 0.0414705508205063, + "grad_norm": 3.7711598873138428, + "learning_rate": 1.377702416278084e-07, + "loss": 0.6403, + "mean_token_accuracy": 0.8040913343429565, + "num_tokens": 12342919.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "grad_norm": 4.077350616455078, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5667, + "mean_token_accuracy": 0.8261089324951172, + "num_tokens": 12380318.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "grad_norm": 4.538190841674805, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.6565, + "mean_token_accuracy": 0.7980632781982422, + "num_tokens": 12418936.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "grad_norm": 4.225887298583984, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6897, + "mean_token_accuracy": 0.7871347069740295, + "num_tokens": 12459537.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "grad_norm": 4.080219268798828, + "learning_rate": 1.394658753709199e-07, + "loss": 0.6438, + "mean_token_accuracy": 0.8011137247085571, + "num_tokens": 12497283.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "grad_norm": 3.533933401107788, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.5979, + "mean_token_accuracy": 0.8145917654037476, + "num_tokens": 12536782.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "grad_norm": 4.611367702484131, + "learning_rate": 1.403136922424756e-07, + "loss": 0.635, + "mean_token_accuracy": 0.8030122518539429, + "num_tokens": 12574283.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "grad_norm": 4.208998680114746, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.6816, + "mean_token_accuracy": 0.791681706905365, + "num_tokens": 12615368.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "grad_norm": 4.09583854675293, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.6395, + "mean_token_accuracy": 0.8031874895095825, + "num_tokens": 12657344.0, + "step": 334 + }, + { 
+ "epoch": 0.04261544332782089, + "grad_norm": 3.2234604358673096, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.6147, + "mean_token_accuracy": 0.8077450394630432, + "num_tokens": 12693142.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "grad_norm": 3.895817518234253, + "learning_rate": 1.420093259855871e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.8232856392860413, + "num_tokens": 12735102.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "grad_norm": 4.067519664764404, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.6161, + "mean_token_accuracy": 0.8119000196456909, + "num_tokens": 12779017.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "grad_norm": 3.4154820442199707, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.6578, + "mean_token_accuracy": 0.7993181943893433, + "num_tokens": 12816068.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "grad_norm": 4.7480902671813965, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.5962, + "mean_token_accuracy": 0.8166620135307312, + "num_tokens": 12850937.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "grad_norm": 4.302094459533691, + "learning_rate": 1.437049597286986e-07, + "loss": 0.6067, + "mean_token_accuracy": 0.8148358464241028, + "num_tokens": 12887067.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "grad_norm": 3.9279024600982666, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.6813, + "mean_token_accuracy": 0.7921576499938965, + "num_tokens": 12922944.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "grad_norm": 4.594663619995117, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.8309823274612427, + "num_tokens": 12954992.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "grad_norm": 3.8019521236419678, + "learning_rate": 1.449766850360322e-07, + "loss": 0.5952, + "mean_token_accuracy": 0.817459762096405, + "num_tokens": 13002183.0, + "step": 343 
+ }, + { + "epoch": 0.04376033583513548, + "grad_norm": 4.115025997161865, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.5916, + "mean_token_accuracy": 0.8183198571205139, + "num_tokens": 13039160.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "grad_norm": 4.186558723449707, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6073, + "mean_token_accuracy": 0.8113303184509277, + "num_tokens": 13079139.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "grad_norm": 4.62770414352417, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6366, + "mean_token_accuracy": 0.800351619720459, + "num_tokens": 13114351.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "grad_norm": 4.226912975311279, + "learning_rate": 1.466723187791437e-07, + "loss": 0.6135, + "mean_token_accuracy": 0.8115348815917969, + "num_tokens": 13149700.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "grad_norm": 3.223947048187256, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.6613, + "mean_token_accuracy": 0.8008755445480347, + "num_tokens": 13190741.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "grad_norm": 3.8783509731292725, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8259324431419373, + "num_tokens": 13235447.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "grad_norm": 3.7936959266662598, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.6158, + "mean_token_accuracy": 0.8084282875061035, + "num_tokens": 13279256.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "grad_norm": 4.212865352630615, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.6359, + "mean_token_accuracy": 0.799461841583252, + "num_tokens": 13314796.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "grad_norm": 3.371389389038086, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.5648, + "mean_token_accuracy": 0.8258641958236694, + "num_tokens": 13355479.0, + 
"step": 352 + }, + { + "epoch": 0.04490522834245007, + "grad_norm": 3.5192551612854004, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6347, + "mean_token_accuracy": 0.8087601065635681, + "num_tokens": 13396185.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "grad_norm": 3.039015054702759, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5951, + "mean_token_accuracy": 0.8139158487319946, + "num_tokens": 13445219.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "grad_norm": 3.254154920578003, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.5905, + "mean_token_accuracy": 0.8174682855606079, + "num_tokens": 13483537.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "grad_norm": 4.532753944396973, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.5885, + "mean_token_accuracy": 0.8127079010009766, + "num_tokens": 13517969.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "grad_norm": 4.850791931152344, + "learning_rate": 1.509114031369224e-07, + "loss": 0.5648, + "mean_token_accuracy": 0.8242654800415039, + "num_tokens": 13553082.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "grad_norm": 3.553295135498047, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6491, + "mean_token_accuracy": 0.8016498684883118, + "num_tokens": 13593189.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "grad_norm": 4.7967376708984375, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.6471, + "mean_token_accuracy": 0.8041332960128784, + "num_tokens": 13628318.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "grad_norm": 4.034188747406006, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.5939, + "mean_token_accuracy": 0.8087543845176697, + "num_tokens": 13662621.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "grad_norm": 3.8838446140289307, + "learning_rate": 1.526070368800339e-07, + "loss": 0.6017, + "mean_token_accuracy": 0.8115798234939575, + "num_tokens": 
13700329.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "grad_norm": 4.700798034667969, + "learning_rate": 1.530309453158118e-07, + "loss": 0.6388, + "mean_token_accuracy": 0.800994336605072, + "num_tokens": 13738916.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "grad_norm": 3.7173008918762207, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.6261, + "mean_token_accuracy": 0.8019958138465881, + "num_tokens": 13776971.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "grad_norm": 3.798983335494995, + "learning_rate": 1.5387876218736753e-07, + "loss": 0.593, + "mean_token_accuracy": 0.8161992430686951, + "num_tokens": 13815144.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "grad_norm": 3.895869255065918, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8216112852096558, + "num_tokens": 13855000.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "grad_norm": 3.871009588241577, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.6826, + "mean_token_accuracy": 0.7968623042106628, + "num_tokens": 13892829.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "grad_norm": 3.943296194076538, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8238402605056763, + "num_tokens": 13936045.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "grad_norm": 4.629212856292725, + "learning_rate": 1.55574395930479e-07, + "loss": 0.6129, + "mean_token_accuracy": 0.8104532957077026, + "num_tokens": 13977259.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "grad_norm": 4.952766418457031, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8322440385818481, + "num_tokens": 14008670.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "grad_norm": 4.084794044494629, + "learning_rate": 1.5642221280203474e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8250154256820679, + 
"num_tokens": 14041634.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "grad_norm": 4.67825174331665, + "learning_rate": 1.5684612123781262e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.820676326751709, + "num_tokens": 14076944.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "grad_norm": 4.945616245269775, + "learning_rate": 1.572700296735905e-07, + "loss": 0.6397, + "mean_token_accuracy": 0.8028733730316162, + "num_tokens": 14109665.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "grad_norm": 3.8300271034240723, + "learning_rate": 1.576939381093684e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8192226886749268, + "num_tokens": 14154642.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "grad_norm": 3.21555233001709, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.6354, + "mean_token_accuracy": 0.8070797920227051, + "num_tokens": 14194043.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "grad_norm": 3.436354637145996, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.604, + "mean_token_accuracy": 0.811208188533783, + "num_tokens": 14230821.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "grad_norm": 5.082107067108154, + "learning_rate": 1.5896566341670198e-07, + "loss": 0.6204, + "mean_token_accuracy": 0.8073912858963013, + "num_tokens": 14261791.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "grad_norm": 5.083056926727295, + "learning_rate": 1.5938957185247988e-07, + "loss": 0.6352, + "mean_token_accuracy": 0.8052966594696045, + "num_tokens": 14295778.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "grad_norm": 3.052185297012329, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.6194, + "mean_token_accuracy": 0.8037083148956299, + "num_tokens": 14335381.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "grad_norm": 4.022050380706787, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.6008, + "mean_token_accuracy": 
0.8112170100212097, + "num_tokens": 14368939.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "grad_norm": 3.514899730682373, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5925, + "mean_token_accuracy": 0.8131148815155029, + "num_tokens": 14401725.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "grad_norm": 3.756641387939453, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5849, + "mean_token_accuracy": 0.8142465353012085, + "num_tokens": 14446175.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "grad_norm": 4.052481651306152, + "learning_rate": 1.6150911403136921e-07, + "loss": 0.6114, + "mean_token_accuracy": 0.8102416396141052, + "num_tokens": 14481681.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "grad_norm": 4.841648578643799, + "learning_rate": 1.619330224671471e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.8195956945419312, + "num_tokens": 14518201.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "grad_norm": 3.3288817405700684, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.5883, + "mean_token_accuracy": 0.8182724714279175, + "num_tokens": 14558445.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "grad_norm": 4.006499290466309, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.5935, + "mean_token_accuracy": 0.8144721388816833, + "num_tokens": 14593434.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "grad_norm": 4.70747184753418, + "learning_rate": 1.632047477744807e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8331979513168335, + "num_tokens": 14629327.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "grad_norm": 4.130184650421143, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6049, + "mean_token_accuracy": 0.8184199929237366, + "num_tokens": 14670233.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "grad_norm": 3.597726821899414, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.5374, + 
"mean_token_accuracy": 0.8295527696609497, + "num_tokens": 14704517.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "grad_norm": 3.8201541900634766, + "learning_rate": 1.6447647308181432e-07, + "loss": 0.5932, + "mean_token_accuracy": 0.815892219543457, + "num_tokens": 14745061.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "grad_norm": 4.337593078613281, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6043, + "mean_token_accuracy": 0.8140489459037781, + "num_tokens": 14787453.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "grad_norm": 3.7347962856292725, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.5865, + "mean_token_accuracy": 0.8200435638427734, + "num_tokens": 14832212.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "grad_norm": 3.6427876949310303, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5702, + "mean_token_accuracy": 0.8246189951896667, + "num_tokens": 14872469.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "grad_norm": 4.631188869476318, + "learning_rate": 1.661721068249258e-07, + "loss": 0.5883, + "mean_token_accuracy": 0.8186418414115906, + "num_tokens": 14907358.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "grad_norm": 5.22842264175415, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6046, + "mean_token_accuracy": 0.810850203037262, + "num_tokens": 14941759.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "grad_norm": 4.277962684631348, + "learning_rate": 1.6701992369648156e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8218345642089844, + "num_tokens": 14979802.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "grad_norm": 3.346482038497925, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.626, + "mean_token_accuracy": 0.8092319369316101, + "num_tokens": 15018471.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "grad_norm": 3.9617695808410645, + "learning_rate": 1.678677405680373e-07, + "loss": 
0.6187, + "mean_token_accuracy": 0.8068166971206665, + "num_tokens": 15055124.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "grad_norm": 3.870638370513916, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.8273136019706726, + "num_tokens": 15095774.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "grad_norm": 2.901545524597168, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.6271, + "mean_token_accuracy": 0.8060747981071472, + "num_tokens": 15135526.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "grad_norm": 3.099465847015381, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.8167208433151245, + "num_tokens": 15173234.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "grad_norm": 3.558251142501831, + "learning_rate": 1.695633743111488e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.806628942489624, + "num_tokens": 15212106.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "grad_norm": 3.187378168106079, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.6253, + "mean_token_accuracy": 0.809200644493103, + "num_tokens": 15251089.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "grad_norm": 4.020248889923096, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8233287334442139, + "num_tokens": 15285022.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "grad_norm": 3.2700181007385254, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.599, + "mean_token_accuracy": 0.8223697543144226, + "num_tokens": 15325890.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "grad_norm": 2.904707193374634, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.6206, + "mean_token_accuracy": 0.8055347204208374, + "num_tokens": 15365035.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "grad_norm": 3.004565715789795, + "learning_rate": 1.7168291649003813e-07, + 
"loss": 0.6337, + "mean_token_accuracy": 0.8056159615516663, + "num_tokens": 15402494.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "grad_norm": 2.9879844188690186, + "learning_rate": 1.7210682492581603e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8416544198989868, + "num_tokens": 15439344.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "grad_norm": 5.405421257019043, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.6037, + "mean_token_accuracy": 0.8103679418563843, + "num_tokens": 15476601.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "grad_norm": 3.9631431102752686, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8285322189331055, + "num_tokens": 15511266.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "grad_norm": 3.164821147918701, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5542, + "mean_token_accuracy": 0.8264356851577759, + "num_tokens": 15550499.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "grad_norm": 3.631258487701416, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5569, + "mean_token_accuracy": 0.8275725841522217, + "num_tokens": 15581563.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "grad_norm": 3.512674331665039, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.5621, + "mean_token_accuracy": 0.8258905410766602, + "num_tokens": 15615210.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "grad_norm": 3.455085277557373, + "learning_rate": 1.7465027554048326e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8271070718765259, + "num_tokens": 15654592.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "grad_norm": 3.3248960971832275, + "learning_rate": 1.750741839762611e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8318590521812439, + "num_tokens": 15691853.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "grad_norm": 3.0636887550354004, + "learning_rate": 
1.75498092412039e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.823588490486145, + "num_tokens": 15729559.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "grad_norm": 4.478031158447266, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.6215, + "mean_token_accuracy": 0.8028355836868286, + "num_tokens": 15759121.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "grad_norm": 3.0120701789855957, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8276969194412231, + "num_tokens": 15801636.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "grad_norm": 3.544715166091919, + "learning_rate": 1.767698177193726e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.8213626146316528, + "num_tokens": 15837444.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "grad_norm": 3.2690322399139404, + "learning_rate": 1.771937261551505e-07, + "loss": 0.5293, + "mean_token_accuracy": 0.8330377340316772, + "num_tokens": 15879376.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "grad_norm": 3.2425875663757324, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.675, + "mean_token_accuracy": 0.7919967174530029, + "num_tokens": 15922382.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "grad_norm": 3.094996690750122, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.6034, + "mean_token_accuracy": 0.8144006133079529, + "num_tokens": 15958592.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "grad_norm": 2.9492154121398926, + "learning_rate": 1.784654514624841e-07, + "loss": 0.528, + "mean_token_accuracy": 0.8312360048294067, + "num_tokens": 15993186.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "grad_norm": 2.8998160362243652, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.6064, + "mean_token_accuracy": 0.8121418952941895, + "num_tokens": 16029210.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "grad_norm": 3.3051319122314453, + 
"learning_rate": 1.7931326833403984e-07, + "loss": 0.5877, + "mean_token_accuracy": 0.8163853883743286, + "num_tokens": 16068031.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "grad_norm": 2.8192224502563477, + "learning_rate": 1.797371767698177e-07, + "loss": 0.6047, + "mean_token_accuracy": 0.8123794794082642, + "num_tokens": 16109232.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "grad_norm": 2.6331820487976074, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8312861323356628, + "num_tokens": 16154189.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "grad_norm": 2.755005359649658, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.5926, + "mean_token_accuracy": 0.8206681609153748, + "num_tokens": 16196717.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "grad_norm": 3.0256409645080566, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.5582, + "mean_token_accuracy": 0.8251327276229858, + "num_tokens": 16238401.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "grad_norm": 2.701812982559204, + "learning_rate": 1.814328105129292e-07, + "loss": 0.6249, + "mean_token_accuracy": 0.8048228025436401, + "num_tokens": 16283766.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "grad_norm": 3.4457836151123047, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.6355, + "mean_token_accuracy": 0.8045220971107483, + "num_tokens": 16325655.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "grad_norm": 3.038896083831787, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8219397068023682, + "num_tokens": 16366306.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "grad_norm": 3.5526185035705566, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8214554786682129, + "num_tokens": 16400964.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "grad_norm": 
2.9802777767181396, + "learning_rate": 1.831284442560407e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8226672410964966, + "num_tokens": 16440022.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "grad_norm": 3.02909517288208, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8211393356323242, + "num_tokens": 16478233.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "grad_norm": 2.4602813720703125, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.8267821669578552, + "num_tokens": 16514879.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "grad_norm": 2.897669792175293, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5892, + "mean_token_accuracy": 0.815633237361908, + "num_tokens": 16553959.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "grad_norm": 2.652308702468872, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.8297119736671448, + "num_tokens": 16598683.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "grad_norm": 2.889627695083618, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8371495604515076, + "num_tokens": 16634224.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "grad_norm": 2.717439651489258, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.6236, + "mean_token_accuracy": 0.8045077919960022, + "num_tokens": 16675660.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "grad_norm": 2.439791440963745, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8403383493423462, + "num_tokens": 16719203.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "grad_norm": 2.791940212249756, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.6242, + "mean_token_accuracy": 0.8049404621124268, + "num_tokens": 16762304.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + 
"grad_norm": 2.73774790763855, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8381928205490112, + "num_tokens": 16801883.0, + "step": 442 + }, + { + "epoch": 0.05635415341559598, + "grad_norm": 3.1410770416259766, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.6741, + "mean_token_accuracy": 0.7952201962471008, + "num_tokens": 16837745.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "grad_norm": 2.695396661758423, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.8191642761230469, + "num_tokens": 16883476.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "grad_norm": 2.543731927871704, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8301531672477722, + "num_tokens": 16927870.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "grad_norm": 2.5884573459625244, + "learning_rate": 1.88639253921153e-07, + "loss": 0.53, + "mean_token_accuracy": 0.831587016582489, + "num_tokens": 16966498.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "grad_norm": 3.158874750137329, + "learning_rate": 1.890631623569309e-07, + "loss": 0.5867, + "mean_token_accuracy": 0.8146483898162842, + "num_tokens": 17005706.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "grad_norm": 2.514328956604004, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.837230920791626, + "num_tokens": 17045960.0, + "step": 448 + }, + { + "epoch": 0.057117415087139044, + "grad_norm": 2.655458927154541, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5919, + "mean_token_accuracy": 0.813300371170044, + "num_tokens": 17085731.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "grad_norm": 2.9526026248931885, + "learning_rate": 1.903348876642645e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.8141226768493652, + "num_tokens": 17122345.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + 
"grad_norm": 3.032113552093506, + "learning_rate": 1.907587961000424e-07, + "loss": 0.5477, + "mean_token_accuracy": 0.8261790871620178, + "num_tokens": 17162330.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "grad_norm": 3.1817243099212646, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.6144, + "mean_token_accuracy": 0.8112854957580566, + "num_tokens": 17202389.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "grad_norm": 2.794623851776123, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8418498635292053, + "num_tokens": 17234065.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "grad_norm": 2.998600482940674, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.5885, + "mean_token_accuracy": 0.8181188106536865, + "num_tokens": 17272741.0, + "step": 454 + }, + { + "epoch": 0.0578806767586821, + "grad_norm": 3.560521125793457, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.568, + "mean_token_accuracy": 0.8187311887741089, + "num_tokens": 17302606.0, + "step": 455 + }, + { + "epoch": 0.05800788703727261, + "grad_norm": 2.8513238430023193, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8290493488311768, + "num_tokens": 17342740.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "grad_norm": 2.7039473056793213, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.5686, + "mean_token_accuracy": 0.8210228681564331, + "num_tokens": 17386660.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "grad_norm": 2.7800655364990234, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.5608, + "mean_token_accuracy": 0.8203668594360352, + "num_tokens": 17428339.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "grad_norm": 2.7611308097839355, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5273, + "mean_token_accuracy": 0.8308548927307129, + "num_tokens": 17465164.0, + "step": 459 + }, + { + "epoch": 
0.05851672815163465, + "grad_norm": 2.6516225337982178, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.6177, + "mean_token_accuracy": 0.8056703805923462, + "num_tokens": 17505233.0, + "step": 460 + }, + { + "epoch": 0.05864393843022516, + "grad_norm": 2.543174982070923, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5963, + "mean_token_accuracy": 0.8163623809814453, + "num_tokens": 17544386.0, + "step": 461 + }, + { + "epoch": 0.058771148708815675, + "grad_norm": 2.6762568950653076, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.8249301314353943, + "num_tokens": 17583917.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "grad_norm": 2.7691264152526855, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.556, + "mean_token_accuracy": 0.8254508376121521, + "num_tokens": 17623308.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "grad_norm": 2.993677854537964, + "learning_rate": 1.962696057651547e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.8179868459701538, + "num_tokens": 17663449.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "grad_norm": 2.491999626159668, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8214432597160339, + "num_tokens": 17698778.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "grad_norm": 2.698652744293213, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8385706543922424, + "num_tokens": 17736003.0, + "step": 466 + }, + { + "epoch": 0.059407200101768226, + "grad_norm": 2.5410923957824707, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.6043, + "mean_token_accuracy": 0.8097003698348999, + "num_tokens": 17776586.0, + "step": 467 + }, + { + "epoch": 0.059534410380358734, + "grad_norm": 2.6073949337005615, + "learning_rate": 1.979652395082662e-07, + "loss": 0.5572, + "mean_token_accuracy": 0.8219572305679321, + "num_tokens": 17815132.0, + "step": 468 + }, + 
{ + "epoch": 0.05966162065894924, + "grad_norm": 2.1969358921051025, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8365013599395752, + "num_tokens": 17855130.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "grad_norm": 2.7424702644348145, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8268136978149414, + "num_tokens": 17892337.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "grad_norm": 2.578071355819702, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.5542, + "mean_token_accuracy": 0.824364185333252, + "num_tokens": 17930291.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "grad_norm": 2.938169002532959, + "learning_rate": 1.996608732513777e-07, + "loss": 0.5859, + "mean_token_accuracy": 0.8249887228012085, + "num_tokens": 17965246.0, + "step": 472 + }, + { + "epoch": 0.060170461773311285, + "grad_norm": 2.6804935932159424, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.561, + "mean_token_accuracy": 0.821133017539978, + "num_tokens": 18000503.0, + "step": 473 + }, + { + "epoch": 0.06029767205190179, + "grad_norm": 2.1285479068756104, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5498, + "mean_token_accuracy": 0.8292447328567505, + "num_tokens": 18043225.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "grad_norm": 2.7257325649261475, + "learning_rate": 2.009325985587113e-07, + "loss": 0.5696, + "mean_token_accuracy": 0.8207398653030396, + "num_tokens": 18077481.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "grad_norm": 2.235269069671631, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8385105729103088, + "num_tokens": 18114172.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "grad_norm": 2.528400421142578, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8407115936279297, + "num_tokens": 18154108.0, + "step": 
477 + }, + { + "epoch": 0.060806513166263836, + "grad_norm": 2.494795799255371, + "learning_rate": 2.022043238660449e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8225640654563904, + "num_tokens": 18194573.0, + "step": 478 + }, + { + "epoch": 0.06093372344485434, + "grad_norm": 2.4900975227355957, + "learning_rate": 2.026282323018228e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.8420597910881042, + "num_tokens": 18229092.0, + "step": 479 + }, + { + "epoch": 0.06106093372344486, + "grad_norm": 2.654128074645996, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.5719, + "mean_token_accuracy": 0.8152101039886475, + "num_tokens": 18264065.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "grad_norm": 1.9989625215530396, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8253297805786133, + "num_tokens": 18306141.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "grad_norm": 2.3775503635406494, + "learning_rate": 2.038999576091564e-07, + "loss": 0.5974, + "mean_token_accuracy": 0.8158704042434692, + "num_tokens": 18345864.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "grad_norm": 2.4494516849517822, + "learning_rate": 2.043238660449343e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8191155195236206, + "num_tokens": 18381456.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "grad_norm": 2.7241344451904297, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.549, + "mean_token_accuracy": 0.828385591506958, + "num_tokens": 18421688.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, + "grad_norm": 2.41066312789917, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.6096, + "mean_token_accuracy": 0.8049050569534302, + "num_tokens": 18463594.0, + "step": 485 + }, + { + "epoch": 0.061824195394987916, + "grad_norm": 2.348881959915161, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8219851851463318, + "num_tokens": 18497583.0, + 
"step": 486 + }, + { + "epoch": 0.06195140567357842, + "grad_norm": 2.406008243560791, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8386270999908447, + "num_tokens": 18533373.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "grad_norm": 2.1907925605773926, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8350424766540527, + "num_tokens": 18574509.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "grad_norm": 2.512517213821411, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5838, + "mean_token_accuracy": 0.8167192935943604, + "num_tokens": 18613165.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "grad_norm": 2.808905839920044, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8310776948928833, + "num_tokens": 18647127.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "grad_norm": 2.682114362716675, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5834, + "mean_token_accuracy": 0.8152129650115967, + "num_tokens": 18686247.0, + "step": 491 + }, + { + "epoch": 0.06258745706653097, + "grad_norm": 2.371685028076172, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8265278339385986, + "num_tokens": 18723156.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "grad_norm": 2.322053909301758, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8359487652778625, + "num_tokens": 18757189.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "grad_norm": 2.3748579025268555, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.5466, + "mean_token_accuracy": 0.8289029598236084, + "num_tokens": 18797937.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "grad_norm": 2.707054853439331, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.5586, + "mean_token_accuracy": 0.8228275179862976, + "num_tokens": 
18830862.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "grad_norm": 2.2222211360931396, + "learning_rate": 2.098346757100466e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.8384988307952881, + "num_tokens": 18872643.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "grad_norm": 2.4857890605926514, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8202787637710571, + "num_tokens": 18910152.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, + "grad_norm": 2.4112772941589355, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5814, + "mean_token_accuracy": 0.8153444528579712, + "num_tokens": 18949446.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "grad_norm": 2.183147668838501, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8217383623123169, + "num_tokens": 18985865.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "grad_norm": 2.3847806453704834, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5981, + "mean_token_accuracy": 0.8130621314048767, + "num_tokens": 19025670.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "grad_norm": 2.36454176902771, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.8277010917663574, + "num_tokens": 19059307.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "grad_norm": 2.3920974731445312, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8320283889770508, + "num_tokens": 19090384.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "grad_norm": 2.3558452129364014, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.542, + "mean_token_accuracy": 0.831689715385437, + "num_tokens": 19121512.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "grad_norm": 2.670606851577759, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.5892, + "mean_token_accuracy": 0.8130568861961365, + 
"num_tokens": 19158522.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "grad_norm": 2.0279269218444824, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8478536605834961, + "num_tokens": 19197101.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "grad_norm": 2.245252847671509, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8441073894500732, + "num_tokens": 19235611.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "grad_norm": 2.605943202972412, + "learning_rate": 2.144976685036032e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8109220862388611, + "num_tokens": 19270904.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "grad_norm": 2.3213367462158203, + "learning_rate": 2.149215769393811e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8356955051422119, + "num_tokens": 19302067.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "grad_norm": 2.036773443222046, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.8273273706436157, + "num_tokens": 19339250.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "grad_norm": 2.2592546939849854, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.5242, + "mean_token_accuracy": 0.8315147757530212, + "num_tokens": 19378828.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "grad_norm": 1.9729957580566406, + "learning_rate": 2.161933022467147e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8392643928527832, + "num_tokens": 19422994.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "grad_norm": 2.094089984893799, + "learning_rate": 2.166172106824926e-07, + "loss": 0.51, + "mean_token_accuracy": 0.836568295955658, + "num_tokens": 19459828.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "grad_norm": 1.9476182460784912, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5173, + "mean_token_accuracy": 
0.8380951881408691, + "num_tokens": 19502715.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "grad_norm": 1.986678123474121, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8368268609046936, + "num_tokens": 19545178.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "grad_norm": 1.9576746225357056, + "learning_rate": 2.178889359898262e-07, + "loss": 0.6295, + "mean_token_accuracy": 0.8005979061126709, + "num_tokens": 19589352.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "grad_norm": 2.1434435844421387, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8399555683135986, + "num_tokens": 19629697.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "grad_norm": 2.406107187271118, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.5685, + "mean_token_accuracy": 0.8219794034957886, + "num_tokens": 19665866.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "grad_norm": 2.3119959831237793, + "learning_rate": 2.191606612971598e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8196880221366882, + "num_tokens": 19706677.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "grad_norm": 2.0880558490753174, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.5967, + "mean_token_accuracy": 0.8124002814292908, + "num_tokens": 19748850.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "grad_norm": 2.213290214538574, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8322451114654541, + "num_tokens": 19783832.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "grad_norm": 2.0372588634490967, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8305181860923767, + "num_tokens": 19824771.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "grad_norm": 2.4234166145324707, + "learning_rate": 2.208562950402713e-07, + "loss": 0.5503, + 
"mean_token_accuracy": 0.8250680565834045, + "num_tokens": 19860165.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "grad_norm": 2.068150758743286, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.5498, + "mean_token_accuracy": 0.8265795707702637, + "num_tokens": 19905863.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "grad_norm": 2.412576198577881, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8209078311920166, + "num_tokens": 19940222.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "grad_norm": 2.124454975128174, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8294378519058228, + "num_tokens": 19980714.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "grad_norm": 2.3164708614349365, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.5944, + "mean_token_accuracy": 0.8084850907325745, + "num_tokens": 20018190.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "grad_norm": 2.0586190223693848, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.6245, + "mean_token_accuracy": 0.8048545122146606, + "num_tokens": 20056993.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "grad_norm": 2.1664271354675293, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8245419263839722, + "num_tokens": 20100324.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "grad_norm": 2.0583441257476807, + "learning_rate": 2.238236540907164e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.8208247423171997, + "num_tokens": 20136416.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "grad_norm": 1.9741287231445312, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5403, + "mean_token_accuracy": 0.8309600949287415, + "num_tokens": 20181632.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "grad_norm": 2.147036075592041, + "learning_rate": 2.2467147096227215e-07, + "loss": 
0.5707, + "mean_token_accuracy": 0.8193100094795227, + "num_tokens": 20224169.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "grad_norm": 2.1028003692626953, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8439104557037354, + "num_tokens": 20261389.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "grad_norm": 2.212266683578491, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.6004, + "mean_token_accuracy": 0.8134157657623291, + "num_tokens": 20297596.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "grad_norm": 2.137341260910034, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8163276314735413, + "num_tokens": 20335929.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "grad_norm": 2.1370227336883545, + "learning_rate": 2.263671047053836e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8235629200935364, + "num_tokens": 20376132.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "grad_norm": 2.2673490047454834, + "learning_rate": 2.267910131411615e-07, + "loss": 0.5571, + "mean_token_accuracy": 0.8241896629333496, + "num_tokens": 20411018.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "grad_norm": 2.3055503368377686, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8273155093193054, + "num_tokens": 20444633.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "grad_norm": 2.1232659816741943, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.5407, + "mean_token_accuracy": 0.8257144093513489, + "num_tokens": 20481316.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "grad_norm": 2.0517194271087646, + "learning_rate": 2.280627384484951e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8286727070808411, + "num_tokens": 20517589.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "grad_norm": 2.249359369277954, + "learning_rate": 2.28486646884273e-07, + 
"loss": 0.5474, + "mean_token_accuracy": 0.8279723525047302, + "num_tokens": 20552863.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "grad_norm": 1.9174174070358276, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8402013182640076, + "num_tokens": 20590500.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "grad_norm": 2.204124689102173, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.5927, + "mean_token_accuracy": 0.8123161792755127, + "num_tokens": 20630659.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "grad_norm": 2.0321204662323, + "learning_rate": 2.297583721916066e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8437764644622803, + "num_tokens": 20666515.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "grad_norm": 1.9227591753005981, + "learning_rate": 2.301822806273845e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8313806653022766, + "num_tokens": 20707319.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "grad_norm": 2.5895919799804688, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.836538553237915, + "num_tokens": 20738190.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "grad_norm": 1.9364941120147705, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8358520865440369, + "num_tokens": 20780714.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "grad_norm": 2.1686582565307617, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8336120843887329, + "num_tokens": 20813000.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "grad_norm": 2.094104528427124, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8376812934875488, + "num_tokens": 20846234.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "grad_norm": 2.0717692375183105, + "learning_rate": 
2.3230182280627383e-07, + "loss": 0.5824, + "mean_token_accuracy": 0.8169090151786804, + "num_tokens": 20887139.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "grad_norm": 2.0231974124908447, + "learning_rate": 2.327257312420517e-07, + "loss": 0.6382, + "mean_token_accuracy": 0.8075422048568726, + "num_tokens": 20924573.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "grad_norm": 2.413367986679077, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8338338136672974, + "num_tokens": 20963980.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "grad_norm": 1.840760588645935, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8302136659622192, + "num_tokens": 21006526.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "grad_norm": 2.0771484375, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.502, + "mean_token_accuracy": 0.836821973323822, + "num_tokens": 21043277.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "grad_norm": 2.2236433029174805, + "learning_rate": 2.344213649851632e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8371484279632568, + "num_tokens": 21077724.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "grad_norm": 2.002459764480591, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.5541, + "mean_token_accuracy": 0.8261209726333618, + "num_tokens": 21120119.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "grad_norm": 2.1981818675994873, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.821366548538208, + "num_tokens": 21163860.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "grad_norm": 2.4234619140625, + "learning_rate": 2.356930902924968e-07, + "loss": 0.5359, + "mean_token_accuracy": 0.8297144770622253, + "num_tokens": 21200571.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "grad_norm": 2.1195578575134277, + "learning_rate": 
2.3611699872827468e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8340418934822083, + "num_tokens": 21241024.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "grad_norm": 2.5949647426605225, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.8214447498321533, + "num_tokens": 21279595.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "grad_norm": 2.0957143306732178, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.5705, + "mean_token_accuracy": 0.8215281963348389, + "num_tokens": 21318208.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "grad_norm": 2.133328437805176, + "learning_rate": 2.373887240356083e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.8151705861091614, + "num_tokens": 21357353.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "grad_norm": 2.2467193603515625, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8228709101676941, + "num_tokens": 21395113.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "grad_norm": 2.3520777225494385, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8150283098220825, + "num_tokens": 21430253.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "grad_norm": 2.3395261764526367, + "learning_rate": 2.386604493429419e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8260012865066528, + "num_tokens": 21462332.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "grad_norm": 2.1508212089538574, + "learning_rate": 2.390843577787198e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8342629671096802, + "num_tokens": 21495388.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "grad_norm": 1.9257972240447998, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8428194522857666, + "num_tokens": 21534056.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "grad_norm": 2.3215279579162598, + 
"learning_rate": 2.3993217465027556e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.824089765548706, + "num_tokens": 21564484.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "grad_norm": 1.9896748065948486, + "learning_rate": 2.403560830860534e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.834140956401825, + "num_tokens": 21605164.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "grad_norm": 1.9930840730667114, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.557, + "mean_token_accuracy": 0.82342129945755, + "num_tokens": 21645488.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "grad_norm": 2.133687973022461, + "learning_rate": 2.4120389995760915e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8298789262771606, + "num_tokens": 21681406.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "grad_norm": 2.0815348625183105, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8237735629081726, + "num_tokens": 21714882.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "grad_norm": 2.222425937652588, + "learning_rate": 2.420517168291649e-07, + "loss": 0.6034, + "mean_token_accuracy": 0.8089828491210938, + "num_tokens": 21749814.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "grad_norm": 1.9630287885665894, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8314760327339172, + "num_tokens": 21788079.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "grad_norm": 1.759160041809082, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8385434150695801, + "num_tokens": 21825981.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "grad_norm": 1.9977562427520752, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8243956565856934, + "num_tokens": 21863669.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "grad_norm": 1.8654391765594482, 
+ "learning_rate": 2.437473505722764e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8382377624511719, + "num_tokens": 21902236.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "grad_norm": 2.071692705154419, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.8268342018127441, + "num_tokens": 21940028.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "grad_norm": 2.108372211456299, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8267173767089844, + "num_tokens": 21978565.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "grad_norm": 1.8322036266326904, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8407233953475952, + "num_tokens": 22014715.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "grad_norm": 1.912639856338501, + "learning_rate": 2.454429843153879e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8270365595817566, + "num_tokens": 22055082.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "grad_norm": 2.0061888694763184, + "learning_rate": 2.458668927511657e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8323522806167603, + "num_tokens": 22091982.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "grad_norm": 1.9099377393722534, + "learning_rate": 2.462908011869436e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.841784656047821, + "num_tokens": 22130683.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "grad_norm": 1.8783384561538696, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8231903314590454, + "num_tokens": 22173076.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "grad_norm": 1.8820831775665283, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.5937, + "mean_token_accuracy": 0.8121581673622131, + "num_tokens": 22211048.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "grad_norm": 
2.5256547927856445, + "learning_rate": 2.475625264942772e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8503242135047913, + "num_tokens": 22235702.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "grad_norm": 1.9182387590408325, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8246052265167236, + "num_tokens": 22270316.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "grad_norm": 1.9177157878875732, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.5481, + "mean_token_accuracy": 0.8260495662689209, + "num_tokens": 22308189.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "grad_norm": 1.9801691770553589, + "learning_rate": 2.488342518016108e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8384388089179993, + "num_tokens": 22345872.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "grad_norm": 1.8723366260528564, + "learning_rate": 2.492581602373887e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8390006422996521, + "num_tokens": 22385884.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "grad_norm": 1.8445531129837036, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8373310565948486, + "num_tokens": 22419210.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "grad_norm": 1.8573992252349854, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8346145153045654, + "num_tokens": 22461266.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "grad_norm": 1.7637585401535034, + "learning_rate": 2.505298855447223e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.8321264982223511, + "num_tokens": 22500185.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "grad_norm": 1.9301912784576416, + "learning_rate": 2.509537939805002e-07, + "loss": 0.6404, + "mean_token_accuracy": 0.7960126399993896, + "num_tokens": 22544733.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + 
"grad_norm": 1.9416136741638184, + "learning_rate": 2.513777024162781e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8498606085777283, + "num_tokens": 22585239.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "grad_norm": 1.945544958114624, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8349268436431885, + "num_tokens": 22630308.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "grad_norm": 1.7043112516403198, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8395074605941772, + "num_tokens": 22671551.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "grad_norm": 1.7627307176589966, + "learning_rate": 2.526494277236117e-07, + "loss": 0.5295, + "mean_token_accuracy": 0.8283582925796509, + "num_tokens": 22707879.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "grad_norm": 1.8574488162994385, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5875, + "mean_token_accuracy": 0.8145828247070312, + "num_tokens": 22750797.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "grad_norm": 1.920612096786499, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.5444, + "mean_token_accuracy": 0.8311365246772766, + "num_tokens": 22786930.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "grad_norm": 2.0028738975524902, + "learning_rate": 2.539211530309453e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8336345553398132, + "num_tokens": 22822864.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "grad_norm": 1.8275113105773926, + "learning_rate": 2.543450614667232e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8390007019042969, + "num_tokens": 22861607.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "grad_norm": 1.9663527011871338, + "learning_rate": 2.547689699025011e-07, + "loss": 0.5933, + "mean_token_accuracy": 0.8111352324485779, + "num_tokens": 22895381.0, + "step": 602 + }, + { + "epoch": 
0.0767077979900776, + "grad_norm": 2.0075008869171143, + "learning_rate": 2.551928783382789e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.833016574382782, + "num_tokens": 22931722.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "grad_norm": 1.9694806337356567, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.5899, + "mean_token_accuracy": 0.8118614554405212, + "num_tokens": 22970242.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "grad_norm": 2.108656644821167, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.833148717880249, + "num_tokens": 23002512.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "grad_norm": 2.362189769744873, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.8302887678146362, + "num_tokens": 23037015.0, + "step": 606 + }, + { + "epoch": 0.07721663910443964, + "grad_norm": 1.7868719100952148, + "learning_rate": 2.568885120813904e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8442944884300232, + "num_tokens": 23077967.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "grad_norm": 1.8185943365097046, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.8314116597175598, + "num_tokens": 23120640.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "grad_norm": 1.7327975034713745, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8433519601821899, + "num_tokens": 23161255.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "grad_norm": 1.873711109161377, + "learning_rate": 2.58160237388724e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8225882053375244, + "num_tokens": 23197986.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "grad_norm": 1.9365453720092773, + "learning_rate": 2.585841458245019e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8409623503684998, + "num_tokens": 23230412.0, + "step": 611 + }, + { + 
"epoch": 0.07785269049739219, + "grad_norm": 1.6577703952789307, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8399919271469116, + "num_tokens": 23272550.0, + "step": 612 + }, + { + "epoch": 0.0779799007759827, + "grad_norm": 1.9993605613708496, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.8276081085205078, + "num_tokens": 23307491.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "grad_norm": 1.8801175355911255, + "learning_rate": 2.598558711318355e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8335307240486145, + "num_tokens": 23346260.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "grad_norm": 1.846917748451233, + "learning_rate": 2.602797795676134e-07, + "loss": 0.526, + "mean_token_accuracy": 0.832841157913208, + "num_tokens": 23383117.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "grad_norm": 1.834951400756836, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8459662199020386, + "num_tokens": 23419696.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "grad_norm": 1.909207820892334, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8208819627761841, + "num_tokens": 23454746.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "grad_norm": 1.7740957736968994, + "learning_rate": 2.61551504874947e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.8278148174285889, + "num_tokens": 23495361.0, + "step": 618 + }, + { + "epoch": 0.07874316244752576, + "grad_norm": 1.8680815696716309, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8344178199768066, + "num_tokens": 23530899.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "grad_norm": 1.7887904644012451, + "learning_rate": 2.623993217465028e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8328065276145935, + "num_tokens": 23568338.0, + "step": 620 + }, + 
{ + "epoch": 0.07899758300470679, + "grad_norm": 1.8784786462783813, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5147, + "mean_token_accuracy": 0.8360715508460999, + "num_tokens": 23606576.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "grad_norm": 1.9390041828155518, + "learning_rate": 2.632471386180585e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8295223712921143, + "num_tokens": 23649726.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "grad_norm": 1.8296620845794678, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8382902145385742, + "num_tokens": 23687209.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "grad_norm": 1.9291832447052002, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.842900276184082, + "num_tokens": 23724614.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + "grad_norm": 1.8619575500488281, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8277555108070374, + "num_tokens": 23764697.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "grad_norm": 1.8889374732971191, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.5483, + "mean_token_accuracy": 0.8282493948936462, + "num_tokens": 23803869.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "grad_norm": 1.596637487411499, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8485259413719177, + "num_tokens": 23845926.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "grad_norm": 1.6872533559799194, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8447583317756653, + "num_tokens": 23887114.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "grad_norm": 1.7144346237182617, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8364776372909546, + "num_tokens": 23926467.0, + 
"step": 629 + }, + { + "epoch": 0.08014247551202137, + "grad_norm": 1.5928452014923096, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8375973701477051, + "num_tokens": 23969549.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "grad_norm": 1.6107994318008423, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8475143909454346, + "num_tokens": 24009779.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "grad_norm": 1.7782524824142456, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8369912505149841, + "num_tokens": 24048065.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "grad_norm": 1.9280928373336792, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8197498321533203, + "num_tokens": 24082592.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "grad_norm": 1.777462363243103, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.522, + "mean_token_accuracy": 0.8305134773254395, + "num_tokens": 24118552.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "grad_norm": 1.977272868156433, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8469948768615723, + "num_tokens": 24154565.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "grad_norm": 1.728829264640808, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8207392692565918, + "num_tokens": 24198389.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "grad_norm": 1.9538094997406006, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8384045362472534, + "num_tokens": 24235850.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "grad_norm": 2.068404197692871, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.831559419631958, + "num_tokens": 
24266856.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "grad_norm": 1.9887399673461914, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.5959, + "mean_token_accuracy": 0.8131457567214966, + "num_tokens": 24300958.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "grad_norm": 1.828322410583496, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8337575197219849, + "num_tokens": 24339358.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "grad_norm": 1.5860055685043335, + "learning_rate": 2.71301398897838e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8419801592826843, + "num_tokens": 24384108.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "grad_norm": 1.6192482709884644, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8446851968765259, + "num_tokens": 24424958.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "grad_norm": 1.9237223863601685, + "learning_rate": 2.721492157693938e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8165209889411926, + "num_tokens": 24464535.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "grad_norm": 1.812971591949463, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8342213034629822, + "num_tokens": 24501969.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "grad_norm": 1.8532230854034424, + "learning_rate": 2.729970326409495e-07, + "loss": 0.5865, + "mean_token_accuracy": 0.8090596199035645, + "num_tokens": 24543782.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "grad_norm": 1.8437554836273193, + "learning_rate": 2.734209410767274e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8402624130249023, + "num_tokens": 24579055.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "grad_norm": 1.807063102722168, + "learning_rate": 2.738448495125053e-07, + "loss": 0.5001, + "mean_token_accuracy": 0.8425784111022949, + 
"num_tokens": 24615913.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "grad_norm": 1.9631296396255493, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8355819582939148, + "num_tokens": 24649612.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "grad_norm": 1.8159899711608887, + "learning_rate": 2.74692666384061e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8270196914672852, + "num_tokens": 24688118.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "grad_norm": 1.7805718183517456, + "learning_rate": 2.751165748198389e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.8328081369400024, + "num_tokens": 24725457.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "grad_norm": 1.8355098962783813, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.8267443180084229, + "num_tokens": 24766840.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "grad_norm": 1.6960060596466064, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8415572643280029, + "num_tokens": 24806671.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "grad_norm": 1.736904263496399, + "learning_rate": 2.763883001271725e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.830539345741272, + "num_tokens": 24850928.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "grad_norm": 1.788981318473816, + "learning_rate": 2.768122085629504e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8467607498168945, + "num_tokens": 24886812.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "grad_norm": 1.7121330499649048, + "learning_rate": 2.772361169987283e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8227304816246033, + "num_tokens": 24929560.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "grad_norm": 1.7675036191940308, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.522, + "mean_token_accuracy": 
0.8316895961761475, + "num_tokens": 24969626.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "grad_norm": 1.822234869003296, + "learning_rate": 2.78083933870284e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8251035213470459, + "num_tokens": 25006804.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "grad_norm": 1.7167259454727173, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8357548117637634, + "num_tokens": 25043493.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "grad_norm": 1.7973616123199463, + "learning_rate": 2.789317507418398e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8415886163711548, + "num_tokens": 25075063.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "grad_norm": 1.7862191200256348, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8340678811073303, + "num_tokens": 25112402.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "grad_norm": 1.8035669326782227, + "learning_rate": 2.797795676133955e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.8245758414268494, + "num_tokens": 25154315.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "grad_norm": 1.5634989738464355, + "learning_rate": 2.802034760491734e-07, + "loss": 0.503, + "mean_token_accuracy": 0.8406793475151062, + "num_tokens": 25195443.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "grad_norm": 1.850490689277649, + "learning_rate": 2.806273844849512e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8222272992134094, + "num_tokens": 25232600.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "grad_norm": 1.5910552740097046, + "learning_rate": 2.810512929207291e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8547884225845337, + "num_tokens": 25270131.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "grad_norm": 1.7020171880722046, + "learning_rate": 2.8147520135650697e-07, + "loss": 0.478, + 
"mean_token_accuracy": 0.8468747138977051, + "num_tokens": 25314616.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "grad_norm": 1.886509656906128, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8337503671646118, + "num_tokens": 25347076.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "grad_norm": 1.6679308414459229, + "learning_rate": 2.823230182280627e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.832751452922821, + "num_tokens": 25388360.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "grad_norm": 2.0317468643188477, + "learning_rate": 2.827469266638406e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8501204252243042, + "num_tokens": 25418449.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "grad_norm": 1.8089511394500732, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8359116315841675, + "num_tokens": 25453547.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "grad_norm": 1.8802404403686523, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.821639358997345, + "num_tokens": 25487552.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "grad_norm": 1.534368872642517, + "learning_rate": 2.840186519711742e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8564454913139343, + "num_tokens": 25526627.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "grad_norm": 1.6609195470809937, + "learning_rate": 2.844425604069521e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8397167325019836, + "num_tokens": 25565544.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "grad_norm": 1.9373246431350708, + "learning_rate": 2.8486646884272995e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8184062242507935, + "num_tokens": 25600876.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "grad_norm": 1.766626000404358, + "learning_rate": 2.8529037727850785e-07, + "loss": 
0.4721, + "mean_token_accuracy": 0.8469036817550659, + "num_tokens": 25639620.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "grad_norm": 1.706300139427185, + "learning_rate": 2.857142857142857e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8475594520568848, + "num_tokens": 25673639.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "grad_norm": 1.6895601749420166, + "learning_rate": 2.861381941500636e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.8338273763656616, + "num_tokens": 25713694.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "grad_norm": 2.0813679695129395, + "learning_rate": 2.8656210258584144e-07, + "loss": 0.5508, + "mean_token_accuracy": 0.8171586990356445, + "num_tokens": 25745459.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "grad_norm": 1.6845381259918213, + "learning_rate": 2.869860110216193e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8312962651252747, + "num_tokens": 25787297.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "grad_norm": 1.7807605266571045, + "learning_rate": 2.874099194573972e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8319361805915833, + "num_tokens": 25826709.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "grad_norm": 1.8200000524520874, + "learning_rate": 2.878338278931751e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8398492336273193, + "num_tokens": 25861973.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "grad_norm": 1.608514428138733, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8432475328445435, + "num_tokens": 25901390.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "grad_norm": 1.6737388372421265, + "learning_rate": 2.886816447647308e-07, + "loss": 0.5863, + "mean_token_accuracy": 0.8112400770187378, + "num_tokens": 25941533.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "grad_norm": 1.596106767654419, + "learning_rate": 2.891055532005087e-07, + 
"loss": 0.4672, + "mean_token_accuracy": 0.8470249176025391, + "num_tokens": 25980042.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "grad_norm": 1.7477632761001587, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.5317, + "mean_token_accuracy": 0.8311672210693359, + "num_tokens": 26020466.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "grad_norm": 1.6416492462158203, + "learning_rate": 2.899533700720644e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8331018686294556, + "num_tokens": 26057479.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "grad_norm": 1.6763356924057007, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8407440781593323, + "num_tokens": 26095185.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "grad_norm": 1.8250908851623535, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.8392454385757446, + "num_tokens": 26130967.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "grad_norm": 1.7910879850387573, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8345446586608887, + "num_tokens": 26167198.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "grad_norm": 1.7967193126678467, + "learning_rate": 2.916490038151759e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8422949910163879, + "num_tokens": 26198493.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "grad_norm": 1.7116880416870117, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8322779536247253, + "num_tokens": 26234509.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "grad_norm": 1.7602695226669312, + "learning_rate": 2.9249682068673166e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8330784440040588, + "num_tokens": 26269705.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "grad_norm": 1.8303918838500977, + "learning_rate": 
2.9292072912250955e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8324469923973083, + "num_tokens": 26309452.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "grad_norm": 1.6674166917800903, + "learning_rate": 2.933446375582874e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8355351686477661, + "num_tokens": 26351487.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "grad_norm": 1.5769070386886597, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8507313132286072, + "num_tokens": 26392303.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "grad_norm": 1.6945476531982422, + "learning_rate": 2.9419245442984315e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8280899524688721, + "num_tokens": 26430576.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "grad_norm": 1.7927781343460083, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8338578939437866, + "num_tokens": 26467606.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "grad_norm": 1.7358167171478271, + "learning_rate": 2.9504027130139884e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.840364933013916, + "num_tokens": 26503192.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "grad_norm": 1.850068211555481, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8296949863433838, + "num_tokens": 26541786.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "grad_norm": 1.8546022176742554, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8373574018478394, + "num_tokens": 26580088.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "grad_norm": 1.827176570892334, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8336479663848877, + "num_tokens": 26614795.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "grad_norm": 1.836633324623108, + 
"learning_rate": 2.9673590504451033e-07, + "loss": 0.492, + "mean_token_accuracy": 0.842118501663208, + "num_tokens": 26646253.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "grad_norm": 1.6434762477874756, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8425525426864624, + "num_tokens": 26683195.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "grad_norm": 1.7268610000610352, + "learning_rate": 2.975837219160661e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8344510197639465, + "num_tokens": 26723460.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "grad_norm": 1.792924165725708, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8256150484085083, + "num_tokens": 26766093.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "grad_norm": 1.7634607553482056, + "learning_rate": 2.984315387876218e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8319228887557983, + "num_tokens": 26801331.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "grad_norm": 1.7769079208374023, + "learning_rate": 2.988554472233997e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8243420124053955, + "num_tokens": 26837869.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "grad_norm": 1.5672944784164429, + "learning_rate": 2.992793556591776e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8333484530448914, + "num_tokens": 26881948.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "grad_norm": 1.5479600429534912, + "learning_rate": 2.997032640949555e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.84311842918396, + "num_tokens": 26924137.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "grad_norm": 1.6125621795654297, + "learning_rate": 3.001271725307333e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8382986783981323, + "num_tokens": 26965548.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "grad_norm": 1.7127623558044434, + 
"learning_rate": 3.005510809665112e-07, + "loss": 0.5469, + "mean_token_accuracy": 0.821901261806488, + "num_tokens": 27003048.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "grad_norm": 1.7484239339828491, + "learning_rate": 3.009749894022891e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8211275339126587, + "num_tokens": 27045598.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "grad_norm": 1.7820619344711304, + "learning_rate": 3.01398897838067e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8464570045471191, + "num_tokens": 27082757.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "grad_norm": 2.0392656326293945, + "learning_rate": 3.018228062738448e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8389930725097656, + "num_tokens": 27112083.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "grad_norm": 1.6741034984588623, + "learning_rate": 3.022467147096227e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8376057147979736, + "num_tokens": 27150623.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "grad_norm": 1.7400708198547363, + "learning_rate": 3.026706231454006e-07, + "loss": 0.5305, + "mean_token_accuracy": 0.8292508721351624, + "num_tokens": 27189767.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "grad_norm": 1.7430157661437988, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.485, + "mean_token_accuracy": 0.8437708020210266, + "num_tokens": 27225999.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "grad_norm": 1.6655746698379517, + "learning_rate": 3.035184400169563e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8428698182106018, + "num_tokens": 27265305.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "grad_norm": 1.7018826007843018, + "learning_rate": 3.039423484527342e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8183097243309021, + "num_tokens": 27307314.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "grad_norm": 1.675166130065918, 
+ "learning_rate": 3.043662568885121e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8258765935897827, + "num_tokens": 27343963.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "grad_norm": 1.6294788122177124, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8511053323745728, + "num_tokens": 27383757.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "grad_norm": 1.622760534286499, + "learning_rate": 3.052140737600678e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8366155028343201, + "num_tokens": 27423110.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "grad_norm": 1.5926071405410767, + "learning_rate": 3.056379821958457e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8448187112808228, + "num_tokens": 27463211.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "grad_norm": 1.780780553817749, + "learning_rate": 3.060618906316236e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.826907217502594, + "num_tokens": 27499362.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "grad_norm": 1.6891191005706787, + "learning_rate": 3.064857990674014e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8386951684951782, + "num_tokens": 27540093.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "grad_norm": 1.7270945310592651, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.4773, + "mean_token_accuracy": 0.8422783017158508, + "num_tokens": 27581269.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "grad_norm": 1.7323949337005615, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.5127, + "mean_token_accuracy": 0.8322663307189941, + "num_tokens": 27619017.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "grad_norm": 1.5994335412979126, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8455902338027954, + "num_tokens": 27661568.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "grad_norm": 
1.6830593347549438, + "learning_rate": 3.081814328105129e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8359675407409668, + "num_tokens": 27697846.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "grad_norm": 1.5708014965057373, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8258227109909058, + "num_tokens": 27742964.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "grad_norm": 1.8131051063537598, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8296607732772827, + "num_tokens": 27780702.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "grad_norm": 1.7243068218231201, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8380707502365112, + "num_tokens": 27819669.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "grad_norm": 1.6073901653289795, + "learning_rate": 3.098770665536244e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.829133152961731, + "num_tokens": 27860076.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "grad_norm": 1.457829475402832, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8381051421165466, + "num_tokens": 27902748.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "grad_norm": 1.8148529529571533, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8262979388237, + "num_tokens": 27937917.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "grad_norm": 1.7446306943893433, + "learning_rate": 3.11148791860958e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8374800682067871, + "num_tokens": 27974054.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "grad_norm": 1.710761547088623, + "learning_rate": 3.115727002967359e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8288262486457825, + "num_tokens": 28015423.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + 
"grad_norm": 1.6918097734451294, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8368070721626282, + "num_tokens": 28054041.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "grad_norm": 1.6125901937484741, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8448787927627563, + "num_tokens": 28091680.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "grad_norm": 1.597694993019104, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8315802812576294, + "num_tokens": 28134857.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "grad_norm": 1.661919116973877, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8446018695831299, + "num_tokens": 28174506.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "grad_norm": 1.6855692863464355, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8385671377182007, + "num_tokens": 28213412.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "grad_norm": 1.5643469095230103, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.837399959564209, + "num_tokens": 28254789.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "grad_norm": 1.7060108184814453, + "learning_rate": 3.14540059347181e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8245723247528076, + "num_tokens": 28292659.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "grad_norm": 1.8165286779403687, + "learning_rate": 3.149639677829589e-07, + "loss": 0.5882, + "mean_token_accuracy": 0.8097468614578247, + "num_tokens": 28328137.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "grad_norm": 1.8481448888778687, + "learning_rate": 3.153878762187368e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8378819227218628, + "num_tokens": 28362354.0, + "step": 745 + }, + { + "epoch": 
0.09489886782852054, + "grad_norm": 1.845745325088501, + "learning_rate": 3.158117846545146e-07, + "loss": 0.5648, + "mean_token_accuracy": 0.8143605589866638, + "num_tokens": 28396730.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "grad_norm": 1.5024909973144531, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8450164794921875, + "num_tokens": 28436091.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "grad_norm": 1.7050414085388184, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8366392254829407, + "num_tokens": 28473123.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "grad_norm": 1.6414626836776733, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8440388441085815, + "num_tokens": 28514971.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "grad_norm": 1.7236766815185547, + "learning_rate": 3.175074183976261e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8402162790298462, + "num_tokens": 28549600.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "grad_norm": 1.7791345119476318, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5349, + "mean_token_accuracy": 0.8240166902542114, + "num_tokens": 28585805.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "grad_norm": 1.6496583223342896, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8302061557769775, + "num_tokens": 28623167.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "grad_norm": 1.7210521697998047, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8362024426460266, + "num_tokens": 28659304.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "grad_norm": 1.7158215045928955, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8257791996002197, + "num_tokens": 28696807.0, + "step": 754 + }, + { 
+ "epoch": 0.09604376033583513, + "grad_norm": 1.728735089302063, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8222265243530273, + "num_tokens": 28739026.0, + "step": 755 + }, + { + "epoch": 0.09617097061442564, + "grad_norm": 1.9332627058029175, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8315415382385254, + "num_tokens": 28769106.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "grad_norm": 1.7191325426101685, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.5609, + "mean_token_accuracy": 0.81818026304245, + "num_tokens": 28807400.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "grad_norm": 1.7032963037490845, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8471060991287231, + "num_tokens": 28841765.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "grad_norm": 1.7482112646102905, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.5417, + "mean_token_accuracy": 0.8248437643051147, + "num_tokens": 28881805.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "grad_norm": 1.8107284307479858, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.5147, + "mean_token_accuracy": 0.8311343193054199, + "num_tokens": 28917565.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "grad_norm": 1.6930954456329346, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.473, + "mean_token_accuracy": 0.846166729927063, + "num_tokens": 28951143.0, + "step": 761 + }, + { + "epoch": 0.0969342322859687, + "grad_norm": 1.701450228691101, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8354460597038269, + "num_tokens": 28987978.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "grad_norm": 1.6542117595672607, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8364375233650208, + "num_tokens": 29025341.0, + "step": 763 
+ }, + { + "epoch": 0.09718865284314973, + "grad_norm": 1.8327654600143433, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.8276492953300476, + "num_tokens": 29058185.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "grad_norm": 1.5356041193008423, + "learning_rate": 3.238660449342942e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8455986976623535, + "num_tokens": 29097540.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "grad_norm": 1.7343838214874268, + "learning_rate": 3.24289953370072e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.8378843069076538, + "num_tokens": 29133099.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "grad_norm": 1.5334267616271973, + "learning_rate": 3.247138618058499e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8266257047653198, + "num_tokens": 29178893.0, + "step": 767 + }, + { + "epoch": 0.09769749395751176, + "grad_norm": 1.625523567199707, + "learning_rate": 3.251377702416278e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8452816009521484, + "num_tokens": 29214791.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "grad_norm": 1.6716225147247314, + "learning_rate": 3.255616786774057e-07, + "loss": 0.485, + "mean_token_accuracy": 0.8412166237831116, + "num_tokens": 29251252.0, + "step": 769 + }, + { + "epoch": 0.09795191451469279, + "grad_norm": 1.705157995223999, + "learning_rate": 3.259855871131835e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8494060039520264, + "num_tokens": 29285666.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "grad_norm": 1.526384949684143, + "learning_rate": 3.264094955489614e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8463700413703918, + "num_tokens": 29328670.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "grad_norm": 1.597875952720642, + "learning_rate": 3.268334039847393e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8269250988960266, + "num_tokens": 29367942.0, + "step": 772 
+ }, + { + "epoch": 0.09833354535046432, + "grad_norm": 1.8594334125518799, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8376799821853638, + "num_tokens": 29402051.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, + "grad_norm": 1.637868881225586, + "learning_rate": 3.27681220856295e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8323352336883545, + "num_tokens": 29442832.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "grad_norm": 1.724916696548462, + "learning_rate": 3.281051292920729e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.821323037147522, + "num_tokens": 29480015.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + "grad_norm": 1.8721305131912231, + "learning_rate": 3.285290377278508e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8254424333572388, + "num_tokens": 29515326.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "grad_norm": 1.5839301347732544, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8408381938934326, + "num_tokens": 29554117.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "grad_norm": 1.549255132675171, + "learning_rate": 3.293768545994065e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8478283882141113, + "num_tokens": 29592413.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "grad_norm": 1.607154369354248, + "learning_rate": 3.298007630351844e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8373349905014038, + "num_tokens": 29631574.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "grad_norm": 1.6498360633850098, + "learning_rate": 3.302246714709623e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8425261378288269, + "num_tokens": 29669099.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "grad_norm": 1.6357256174087524, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8317707777023315, + "num_tokens": 29708489.0, + "step": 
781 + }, + { + "epoch": 0.0994784378577789, + "grad_norm": 1.6513773202896118, + "learning_rate": 3.31072488342518e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8420281410217285, + "num_tokens": 29743586.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "grad_norm": 1.7949676513671875, + "learning_rate": 3.314963967782959e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.831636905670166, + "num_tokens": 29778977.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "grad_norm": 1.6052296161651611, + "learning_rate": 3.319203052140738e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.8275814056396484, + "num_tokens": 29819567.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "grad_norm": 1.6445212364196777, + "learning_rate": 3.323442136498516e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8452223539352417, + "num_tokens": 29855404.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "grad_norm": 1.6731892824172974, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8326588273048401, + "num_tokens": 29895262.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "grad_norm": 1.5425739288330078, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8452913761138916, + "num_tokens": 29936018.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "grad_norm": 1.7588223218917847, + "learning_rate": 3.336159389571852e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8384056091308594, + "num_tokens": 29969779.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "grad_norm": 1.7677689790725708, + "learning_rate": 3.340398473929631e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8196055293083191, + "num_tokens": 30008716.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "grad_norm": 1.5408018827438354, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8350340127944946, + "num_tokens": 30052524.0, + 
"step": 790 + }, + { + "epoch": 0.1006233303650935, + "grad_norm": 1.7281699180603027, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8506964445114136, + "num_tokens": 30090227.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "grad_norm": 1.6690402030944824, + "learning_rate": 3.353115727002967e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8441179394721985, + "num_tokens": 30127727.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "grad_norm": 1.682363510131836, + "learning_rate": 3.357354811360746e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8360433578491211, + "num_tokens": 30166291.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "grad_norm": 1.6330617666244507, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5014, + "mean_token_accuracy": 0.83580482006073, + "num_tokens": 30205099.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "grad_norm": 1.6817609071731567, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8457059264183044, + "num_tokens": 30243999.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "grad_norm": 1.7788722515106201, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.8232762217521667, + "num_tokens": 30280537.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "grad_norm": 1.7842199802398682, + "learning_rate": 3.374311148791861e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8409927487373352, + "num_tokens": 30314369.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "grad_norm": 1.5854090452194214, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8367894291877747, + "num_tokens": 30357370.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "grad_norm": 1.5689274072647095, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8282128572463989, + "num_tokens": 
30400914.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "grad_norm": 1.5930864810943604, + "learning_rate": 3.387028401865197e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8425707221031189, + "num_tokens": 30438508.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "grad_norm": 1.5603519678115845, + "learning_rate": 3.391267486222976e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8384552597999573, + "num_tokens": 30479011.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "grad_norm": 1.4102294445037842, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.4365, + "mean_token_accuracy": 0.8569283485412598, + "num_tokens": 30521760.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "grad_norm": 1.5422027111053467, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8377341628074646, + "num_tokens": 30561529.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "grad_norm": 1.6339287757873535, + "learning_rate": 3.403984739296312e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.8261802196502686, + "num_tokens": 30601510.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "grad_norm": 1.5963867902755737, + "learning_rate": 3.408223823654091e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8485316038131714, + "num_tokens": 30644869.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "grad_norm": 1.5908527374267578, + "learning_rate": 3.412462908011869e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8499518036842346, + "num_tokens": 30682325.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "grad_norm": 1.741020679473877, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8355150818824768, + "num_tokens": 30721799.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "grad_norm": 1.8477615118026733, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8200539350509644, + 
"num_tokens": 30758490.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "grad_norm": 1.751399040222168, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.8260117769241333, + "num_tokens": 30800115.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "grad_norm": 1.7376612424850464, + "learning_rate": 3.429419245442984e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8399513363838196, + "num_tokens": 30836829.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "grad_norm": 1.644469976425171, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8529092669487, + "num_tokens": 30874884.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "grad_norm": 1.9505537748336792, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8365245461463928, + "num_tokens": 30903944.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "grad_norm": 1.639350175857544, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.8500406742095947, + "num_tokens": 30939892.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "grad_norm": 1.5294294357299805, + "learning_rate": 3.446375582874099e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8452306985855103, + "num_tokens": 30980055.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "grad_norm": 1.7756242752075195, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.8241187334060669, + "num_tokens": 31017669.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "grad_norm": 1.5355161428451538, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8383426070213318, + "num_tokens": 31056738.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "grad_norm": 1.6505920886993408, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5238, + "mean_token_accuracy": 
0.8320659399032593, + "num_tokens": 31094718.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "grad_norm": 1.5822492837905884, + "learning_rate": 3.463331920305214e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8383364677429199, + "num_tokens": 31135246.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "grad_norm": 1.630651831626892, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8437071442604065, + "num_tokens": 31171724.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "grad_norm": 1.8421932458877563, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.847665548324585, + "num_tokens": 31204427.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "grad_norm": 1.6814640760421753, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.843942403793335, + "num_tokens": 31241246.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "grad_norm": 1.6606056690216064, + "learning_rate": 3.480288257736329e-07, + "loss": 0.5892, + "mean_token_accuracy": 0.8133302330970764, + "num_tokens": 31285802.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "grad_norm": 1.5545094013214111, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8326601982116699, + "num_tokens": 31327133.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "grad_norm": 1.7037277221679688, + "learning_rate": 3.4887664264518863e-07, + "loss": 0.511, + "mean_token_accuracy": 0.8382314443588257, + "num_tokens": 31363816.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "grad_norm": 1.7256908416748047, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.5039, + "mean_token_accuracy": 0.8389902114868164, + "num_tokens": 31398311.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "grad_norm": 1.5885099172592163, + "learning_rate": 3.497244595167443e-07, + "loss": 0.5191, + 
"mean_token_accuracy": 0.8307480812072754, + "num_tokens": 31444402.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "grad_norm": 1.5506612062454224, + "learning_rate": 3.501483679525222e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8431943655014038, + "num_tokens": 31482491.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "grad_norm": 1.420167088508606, + "learning_rate": 3.505722763883001e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.845245897769928, + "num_tokens": 31528083.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "grad_norm": 1.592381238937378, + "learning_rate": 3.50996184824078e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8458788394927979, + "num_tokens": 31566502.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "grad_norm": 1.6516189575195312, + "learning_rate": 3.514200932598558e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.8271012306213379, + "num_tokens": 31603289.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "grad_norm": 1.5013617277145386, + "learning_rate": 3.518440016956337e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8554024696350098, + "num_tokens": 31642843.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "grad_norm": 1.656402587890625, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5018, + "mean_token_accuracy": 0.8359628915786743, + "num_tokens": 31678703.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "grad_norm": 1.644974946975708, + "learning_rate": 3.526918185671895e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.8435934782028198, + "num_tokens": 31716275.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "grad_norm": 1.531743049621582, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8375062942504883, + "num_tokens": 31760616.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "grad_norm": 1.7654922008514404, + "learning_rate": 3.535396354387452e-07, + "loss": 0.4914, + 
"mean_token_accuracy": 0.8407360315322876, + "num_tokens": 31796061.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "grad_norm": 1.735142469406128, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5192, + "mean_token_accuracy": 0.8317252397537231, + "num_tokens": 31839527.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "grad_norm": 1.7443820238113403, + "learning_rate": 3.54387452310301e-07, + "loss": 0.5693, + "mean_token_accuracy": 0.8155525922775269, + "num_tokens": 31881570.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "grad_norm": 1.6645680665969849, + "learning_rate": 3.548113607460788e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8504593372344971, + "num_tokens": 31917952.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "grad_norm": 1.5448596477508545, + "learning_rate": 3.552352691818567e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8373653888702393, + "num_tokens": 31953677.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "grad_norm": 1.538304090499878, + "learning_rate": 3.556591776176346e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8450150489807129, + "num_tokens": 31995277.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "grad_norm": 1.4660587310791016, + "learning_rate": 3.560830860534125e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8325237035751343, + "num_tokens": 32039254.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "grad_norm": 1.6617501974105835, + "learning_rate": 3.565069944891903e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8492598533630371, + "num_tokens": 32076885.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "grad_norm": 1.6409131288528442, + "learning_rate": 3.569309029249682e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8576377034187317, + "num_tokens": 32112024.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "grad_norm": 1.7166157960891724, + "learning_rate": 3.573548113607461e-07, + "loss": 0.4624, 
+ "mean_token_accuracy": 0.8478046655654907, + "num_tokens": 32147895.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "grad_norm": 1.4986670017242432, + "learning_rate": 3.577787197965239e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8492968082427979, + "num_tokens": 32189082.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "grad_norm": 1.5927025079727173, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8480266332626343, + "num_tokens": 32226995.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "grad_norm": 1.6353600025177002, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.832227349281311, + "num_tokens": 32267920.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "grad_norm": 1.5838931798934937, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5496, + "mean_token_accuracy": 0.8241367340087891, + "num_tokens": 32309123.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "grad_norm": 1.5725152492523193, + "learning_rate": 3.594743535396354e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8495245575904846, + "num_tokens": 32347550.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "grad_norm": 1.641298532485962, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8428032398223877, + "num_tokens": 32387881.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "grad_norm": 1.6329760551452637, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8356874585151672, + "num_tokens": 32429570.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "grad_norm": 1.4167503118515015, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8404685258865356, + "num_tokens": 32480079.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "grad_norm": 1.6725711822509766, + "learning_rate": 3.611699872827469e-07, + 
"loss": 0.4904, + "mean_token_accuracy": 0.8381780385971069, + "num_tokens": 32514125.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "grad_norm": 1.57291579246521, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8425382375717163, + "num_tokens": 32555673.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "grad_norm": 1.7456282377243042, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.5538, + "mean_token_accuracy": 0.8295837640762329, + "num_tokens": 32593259.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "grad_norm": 1.6077210903167725, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8502312302589417, + "num_tokens": 32629954.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "grad_norm": 1.5825202465057373, + "learning_rate": 3.628656210258584e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8453472852706909, + "num_tokens": 32667202.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "grad_norm": 1.7439923286437988, + "learning_rate": 3.6328952946163624e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8312586545944214, + "num_tokens": 32702360.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "grad_norm": 1.599016547203064, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8290987014770508, + "num_tokens": 32742817.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "grad_norm": 1.6528903245925903, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8442507982254028, + "num_tokens": 32778534.0, + "step": 860 + }, + { + "epoch": 0.10952804986642921, + "grad_norm": 1.6264103651046753, + "learning_rate": 3.645612547689699e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.840825080871582, + "num_tokens": 32813486.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "grad_norm": 1.6361632347106934, + "learning_rate": 
3.6498516320474773e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8490341901779175, + "num_tokens": 32847192.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "grad_norm": 1.5180950164794922, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8377274870872498, + "num_tokens": 32888447.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "grad_norm": 1.7376149892807007, + "learning_rate": 3.658329800763035e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.814906120300293, + "num_tokens": 32926684.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "grad_norm": 1.5330334901809692, + "learning_rate": 3.662568885120814e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.836479902267456, + "num_tokens": 32968688.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "grad_norm": 1.873520016670227, + "learning_rate": 3.666807969478592e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8242615461349487, + "num_tokens": 33000176.0, + "step": 866 + }, + { + "epoch": 0.11029131153797227, + "grad_norm": 1.5442867279052734, + "learning_rate": 3.671047053836371e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8363595008850098, + "num_tokens": 33042328.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "grad_norm": 1.7439372539520264, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8399072885513306, + "num_tokens": 33076161.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "grad_norm": 1.9114590883255005, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8292959928512573, + "num_tokens": 33105109.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "grad_norm": 1.6532490253448486, + "learning_rate": 3.6837643069097077e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8337019085884094, + "num_tokens": 33141566.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "grad_norm": 1.6377888917922974, + 
"learning_rate": 3.688003391267486e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8402984738349915, + "num_tokens": 33177051.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "grad_norm": 1.7747046947479248, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.5195, + "mean_token_accuracy": 0.8283834457397461, + "num_tokens": 33208346.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, + "grad_norm": 1.5178837776184082, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.4269, + "mean_token_accuracy": 0.8565782308578491, + "num_tokens": 33247785.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "grad_norm": 1.6624417304992676, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8451917171478271, + "num_tokens": 33285002.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "grad_norm": 1.5402181148529053, + "learning_rate": 3.704959728698601e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8481322526931763, + "num_tokens": 33325825.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "grad_norm": 1.7226595878601074, + "learning_rate": 3.7091988130563795e-07, + "loss": 0.4765, + "mean_token_accuracy": 0.842848539352417, + "num_tokens": 33361271.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "grad_norm": 1.7912172079086304, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8373481631278992, + "num_tokens": 33392419.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "grad_norm": 1.6297410726547241, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8427643775939941, + "num_tokens": 33430207.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "grad_norm": 1.4834520816802979, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.844527542591095, + "num_tokens": 33474098.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "grad_norm": 
1.665469765663147, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8381124138832092, + "num_tokens": 33509282.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "grad_norm": 1.81473708152771, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8444326519966125, + "num_tokens": 33542981.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "grad_norm": 1.537702202796936, + "learning_rate": 3.7346333192030524e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8474751114845276, + "num_tokens": 33583622.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "grad_norm": 1.5808025598526, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8468503952026367, + "num_tokens": 33621579.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "grad_norm": 1.6357190608978271, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8527389764785767, + "num_tokens": 33657725.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "grad_norm": 1.7708200216293335, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.844271183013916, + "num_tokens": 33695663.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "grad_norm": 1.6808347702026367, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.4564, + "mean_token_accuracy": 0.848204493522644, + "num_tokens": 33729708.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "grad_norm": 1.6560148000717163, + "learning_rate": 3.755828740991945e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8569492101669312, + "num_tokens": 33768168.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "grad_norm": 1.4987810850143433, + "learning_rate": 3.760067825349724e-07, + "loss": 0.517, + "mean_token_accuracy": 0.8328081965446472, + "num_tokens": 33813920.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + 
"grad_norm": 1.5650807619094849, + "learning_rate": 3.764306909707503e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8418620228767395, + "num_tokens": 33851252.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "grad_norm": 1.7160484790802002, + "learning_rate": 3.768545994065282e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8386446833610535, + "num_tokens": 33884951.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "grad_norm": 1.6764038801193237, + "learning_rate": 3.77278507842306e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8385401964187622, + "num_tokens": 33920236.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "grad_norm": 1.5626587867736816, + "learning_rate": 3.777024162780839e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8419647216796875, + "num_tokens": 33960370.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "grad_norm": 1.5727214813232422, + "learning_rate": 3.781263247138618e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.830757200717926, + "num_tokens": 34000008.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "grad_norm": 1.6355361938476562, + "learning_rate": 3.785502331496397e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8391983509063721, + "num_tokens": 34037582.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "grad_norm": 1.7130693197250366, + "learning_rate": 3.789741415854175e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8462049961090088, + "num_tokens": 34070901.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "grad_norm": 1.4890128374099731, + "learning_rate": 3.793980500211954e-07, + "loss": 0.4289, + "mean_token_accuracy": 0.856572151184082, + "num_tokens": 34112775.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "grad_norm": 1.4541000127792358, + "learning_rate": 3.798219584569733e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8370314836502075, + "num_tokens": 34157309.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, 
+ "grad_norm": 1.6695817708969116, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8432722687721252, + "num_tokens": 34192201.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "grad_norm": 1.4512555599212646, + "learning_rate": 3.80669775328529e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8487367630004883, + "num_tokens": 34233324.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "grad_norm": 1.653884768486023, + "learning_rate": 3.810936837643069e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8356153964996338, + "num_tokens": 34273863.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "grad_norm": 1.5885525941848755, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8414428234100342, + "num_tokens": 34312381.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "grad_norm": 1.6487045288085938, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.8437884449958801, + "num_tokens": 34347552.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "grad_norm": 1.505063772201538, + "learning_rate": 3.823654090716405e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8316137790679932, + "num_tokens": 34389417.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "grad_norm": 1.6166194677352905, + "learning_rate": 3.827893175074184e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8534427881240845, + "num_tokens": 34424833.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "grad_norm": 1.7519619464874268, + "learning_rate": 3.832132259431963e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8330557346343994, + "num_tokens": 34462012.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "grad_norm": 1.6106168031692505, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.8276973962783813, + "num_tokens": 34500700.0, + "step": 906 + }, + { + "epoch": 
0.11537972268159268, + "grad_norm": 1.6643677949905396, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.8452144861221313, + "num_tokens": 34536977.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "grad_norm": 1.584021806716919, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8559245467185974, + "num_tokens": 34574077.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "grad_norm": 1.550539493560791, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8455666303634644, + "num_tokens": 34615597.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "grad_norm": 1.7740161418914795, + "learning_rate": 3.853327681220856e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8408650159835815, + "num_tokens": 34649956.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "grad_norm": 1.6448227167129517, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.4343, + "mean_token_accuracy": 0.8555866479873657, + "num_tokens": 34683399.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "grad_norm": 1.6763395071029663, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8422690033912659, + "num_tokens": 34718660.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "grad_norm": 1.532333493232727, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8484011292457581, + "num_tokens": 34759209.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "grad_norm": 1.6202601194381714, + "learning_rate": 3.870284018651971e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.837698221206665, + "num_tokens": 34797953.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "grad_norm": 1.8276526927947998, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8470228910446167, + "num_tokens": 34827513.0, + "step": 915 + }, + { + 
"epoch": 0.11652461518890726, + "grad_norm": 1.471224308013916, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.862296462059021, + "num_tokens": 34867088.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "grad_norm": 1.5737104415893555, + "learning_rate": 3.883001271725307e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8400610685348511, + "num_tokens": 34906581.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "grad_norm": 1.5765682458877563, + "learning_rate": 3.887240356083086e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8389139175415039, + "num_tokens": 34946378.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "grad_norm": 1.4919873476028442, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.834770679473877, + "num_tokens": 34990477.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "grad_norm": 1.7367171049118042, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8434979915618896, + "num_tokens": 35023145.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "grad_norm": 1.8318008184432983, + "learning_rate": 3.899957609156422e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8416553735733032, + "num_tokens": 35055653.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "grad_norm": 1.5695836544036865, + "learning_rate": 3.904196693514201e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8544496297836304, + "num_tokens": 35095052.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "grad_norm": 1.5891202688217163, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8434392213821411, + "num_tokens": 35134374.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "grad_norm": 1.6297731399536133, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8433839678764343, + "num_tokens": 35171091.0, + "step": 924 + 
}, + { + "epoch": 0.11766950769622185, + "grad_norm": 1.801956295967102, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8289014101028442, + "num_tokens": 35206626.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "grad_norm": 1.6570125818252563, + "learning_rate": 3.921153030945316e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8579950332641602, + "num_tokens": 35242871.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "grad_norm": 1.5452582836151123, + "learning_rate": 3.925392115303094e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.844609260559082, + "num_tokens": 35281779.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "grad_norm": 1.3544628620147705, + "learning_rate": 3.929631199660873e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8424955606460571, + "num_tokens": 35332590.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "grad_norm": 1.530834674835205, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.851364016532898, + "num_tokens": 35370014.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "grad_norm": 1.70395827293396, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8514268398284912, + "num_tokens": 35404749.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "grad_norm": 1.5160155296325684, + "learning_rate": 3.942348452734209e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8546990752220154, + "num_tokens": 35443806.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "grad_norm": 1.7584853172302246, + "learning_rate": 3.946587537091988e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8314582109451294, + "num_tokens": 35480860.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "grad_norm": 1.627795696258545, + "learning_rate": 3.9508266214497666e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8423563838005066, + "num_tokens": 35518783.0, + "step": 933 
+ }, + { + "epoch": 0.11881440020353645, + "grad_norm": 1.659144639968872, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8310092687606812, + "num_tokens": 35557493.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "grad_norm": 1.5572272539138794, + "learning_rate": 3.959304790165324e-07, + "loss": 0.5253, + "mean_token_accuracy": 0.8324481844902039, + "num_tokens": 35599692.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "grad_norm": 1.3763679265975952, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8482948541641235, + "num_tokens": 35644089.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "grad_norm": 1.5732526779174805, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8465068340301514, + "num_tokens": 35682823.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "grad_norm": 1.807296872138977, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.825604259967804, + "num_tokens": 35712997.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "grad_norm": 1.6042349338531494, + "learning_rate": 3.976261127596439e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.850561261177063, + "num_tokens": 35748506.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "grad_norm": 1.573053002357483, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8568578958511353, + "num_tokens": 35786583.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "grad_norm": 1.5089285373687744, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8470027446746826, + "num_tokens": 35828710.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "grad_norm": 1.5801666975021362, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8348778486251831, + "num_tokens": 35870598.0, + 
"step": 942 + }, + { + "epoch": 0.11995929271085104, + "grad_norm": 1.5382288694381714, + "learning_rate": 3.993217465027554e-07, + "loss": 0.4493, + "mean_token_accuracy": 0.8521163463592529, + "num_tokens": 35907649.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "grad_norm": 1.559300184249878, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8540292978286743, + "num_tokens": 35947883.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "grad_norm": 1.5574853420257568, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.432, + "mean_token_accuracy": 0.8604984879493713, + "num_tokens": 35986016.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "grad_norm": 1.6586474180221558, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8458207845687866, + "num_tokens": 36021788.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "grad_norm": 1.5225781202316284, + "learning_rate": 4.010173802458669e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8548777103424072, + "num_tokens": 36059982.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "grad_norm": 1.6313494443893433, + "learning_rate": 4.014412886816447e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8450294733047485, + "num_tokens": 36099078.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "grad_norm": 1.600633144378662, + "learning_rate": 4.018651971174226e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8444342613220215, + "num_tokens": 36139014.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "grad_norm": 1.686334252357483, + "learning_rate": 4.022891055532005e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8467740416526794, + "num_tokens": 36175819.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "grad_norm": 1.6313682794570923, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8303073644638062, + "num_tokens": 
36216658.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "grad_norm": 1.4552470445632935, + "learning_rate": 4.031369224247562e-07, + "loss": 0.3863, + "mean_token_accuracy": 0.8722641468048096, + "num_tokens": 36256678.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "grad_norm": 1.4400800466537476, + "learning_rate": 4.035608308605341e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8454450368881226, + "num_tokens": 36300676.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "grad_norm": 1.6500859260559082, + "learning_rate": 4.03984739296312e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8510596752166748, + "num_tokens": 36334489.0, + "step": 954 + }, + { + "epoch": 0.12148581605393716, + "grad_norm": 1.5292741060256958, + "learning_rate": 4.044086477320898e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8417552709579468, + "num_tokens": 36376781.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "grad_norm": 1.493704080581665, + "learning_rate": 4.048325561678677e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8489898443222046, + "num_tokens": 36422177.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "grad_norm": 1.5185296535491943, + "learning_rate": 4.052564646036456e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.85786372423172, + "num_tokens": 36460679.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "grad_norm": 1.693071722984314, + "learning_rate": 4.056803730394235e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8424810171127319, + "num_tokens": 36495828.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "grad_norm": 1.7423908710479736, + "learning_rate": 4.061042814752013e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8480054140090942, + "num_tokens": 36527734.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "grad_norm": 1.542807936668396, + "learning_rate": 4.065281899109792e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8410757780075073, + "num_tokens": 
36566236.0, + "step": 960 + }, + { + "epoch": 0.12224907772548022, + "grad_norm": 1.7076367139816284, + "learning_rate": 4.069520983467571e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8544141054153442, + "num_tokens": 36598307.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "grad_norm": 1.9313079118728638, + "learning_rate": 4.07376006782535e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8231958150863647, + "num_tokens": 36632678.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "grad_norm": 1.6854444742202759, + "learning_rate": 4.077999152183128e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8315836191177368, + "num_tokens": 36670034.0, + "step": 963 + }, + { + "epoch": 0.12263070856125174, + "grad_norm": 1.5693198442459106, + "learning_rate": 4.082238236540907e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8332141041755676, + "num_tokens": 36709187.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "grad_norm": 1.7280434370040894, + "learning_rate": 4.086477320898686e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8427718281745911, + "num_tokens": 36745507.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "grad_norm": 1.6947600841522217, + "learning_rate": 4.090716405256465e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.8264025449752808, + "num_tokens": 36784944.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + "grad_norm": 1.4769022464752197, + "learning_rate": 4.094955489614243e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.846587598323822, + "num_tokens": 36825729.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "grad_norm": 2.0865375995635986, + "learning_rate": 4.099194573972022e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.825724720954895, + "num_tokens": 36854920.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "grad_norm": 1.6283131837844849, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8358774185180664, + 
"num_tokens": 36892552.0, + "step": 969 + }, + { + "epoch": 0.1233939702327948, + "grad_norm": 1.6086941957473755, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.850591242313385, + "num_tokens": 36929185.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "grad_norm": 1.6226396560668945, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8440759181976318, + "num_tokens": 36964584.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "grad_norm": 1.5278569459915161, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8465268611907959, + "num_tokens": 37005493.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "grad_norm": 1.5117732286453247, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8499464988708496, + "num_tokens": 37044885.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "grad_norm": 1.69826340675354, + "learning_rate": 4.124629080118694e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8551647067070007, + "num_tokens": 37076794.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "grad_norm": 1.7044672966003418, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.8278470039367676, + "num_tokens": 37114531.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + "grad_norm": 1.6220673322677612, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8382470607757568, + "num_tokens": 37150963.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "grad_norm": 1.459797739982605, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8501843810081482, + "num_tokens": 37191466.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "grad_norm": 1.4082913398742676, + "learning_rate": 4.141585417549809e-07, + "loss": 0.4742, + "mean_token_accuracy": 
0.8410291075706482, + "num_tokens": 37233363.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "grad_norm": 1.7229580879211426, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8271769285202026, + "num_tokens": 37268090.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "grad_norm": 1.5433030128479004, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8524841070175171, + "num_tokens": 37307091.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "grad_norm": 1.6217432022094727, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8311251997947693, + "num_tokens": 37346890.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "grad_norm": 1.7982988357543945, + "learning_rate": 4.158541754980924e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8293678164482117, + "num_tokens": 37376709.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "grad_norm": 1.5430766344070435, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8441639542579651, + "num_tokens": 37416180.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "grad_norm": 1.5461163520812988, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8544751405715942, + "num_tokens": 37454911.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "grad_norm": 1.609217643737793, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8484106659889221, + "num_tokens": 37489316.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "grad_norm": 1.804740309715271, + "learning_rate": 4.175498092412039e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8360431790351868, + "num_tokens": 37520399.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "grad_norm": 1.5211931467056274, + "learning_rate": 4.179737176769817e-07, + "loss": 0.4873, + 
"mean_token_accuracy": 0.8403990268707275, + "num_tokens": 37561078.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "grad_norm": 1.5693814754486084, + "learning_rate": 4.183976261127596e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8432604074478149, + "num_tokens": 37601554.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "grad_norm": 1.5471320152282715, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8479458093643188, + "num_tokens": 37638831.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "grad_norm": 1.5373872518539429, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.5066, + "mean_token_accuracy": 0.8356397151947021, + "num_tokens": 37680738.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "grad_norm": 1.7139214277267456, + "learning_rate": 4.196693514200932e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8415963649749756, + "num_tokens": 37713831.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "grad_norm": 1.7510852813720703, + "learning_rate": 4.200932598558711e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8282293081283569, + "num_tokens": 37750902.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "grad_norm": 1.5931159257888794, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8395728468894958, + "num_tokens": 37790206.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "grad_norm": 1.7097654342651367, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.8568712472915649, + "num_tokens": 37827182.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "grad_norm": 1.6806981563568115, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8326249122619629, + "num_tokens": 37864658.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "grad_norm": 1.4065091609954834, + "learning_rate": 4.217888935989826e-07, + "loss": 
0.4291, + "mean_token_accuracy": 0.8581339120864868, + "num_tokens": 37907516.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "grad_norm": 1.765845537185669, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8593080043792725, + "num_tokens": 37942264.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "grad_norm": 1.643460988998413, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8522094488143921, + "num_tokens": 37975882.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "grad_norm": 1.631158471107483, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8415888547897339, + "num_tokens": 38012551.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "grad_norm": 1.5425015687942505, + "learning_rate": 4.234845273420941e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8309410810470581, + "num_tokens": 38053322.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "grad_norm": 1.4419560432434082, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8457180261611938, + "num_tokens": 38097126.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "grad_norm": 1.5838720798492432, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8536875247955322, + "num_tokens": 38135885.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "grad_norm": 1.6570274829864502, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.8408310413360596, + "num_tokens": 38172552.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "grad_norm": 1.6537457704544067, + "learning_rate": 4.251801610852056e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8381050825119019, + "num_tokens": 38208794.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "grad_norm": 1.6600263118743896, + "learning_rate": 
4.2560406952098343e-07, + "loss": 0.5303, + "mean_token_accuracy": 0.8287382125854492, + "num_tokens": 38247920.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "grad_norm": 1.5164090394973755, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.8520482778549194, + "num_tokens": 38287508.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "grad_norm": 1.6942883729934692, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8387021422386169, + "num_tokens": 38320305.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "grad_norm": 1.696898341178894, + "learning_rate": 4.26875794828317e-07, + "loss": 0.4362, + "mean_token_accuracy": 0.8564326167106628, + "num_tokens": 38352442.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "grad_norm": 1.4412845373153687, + "learning_rate": 4.272997032640949e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8532669544219971, + "num_tokens": 38396086.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "grad_norm": 1.6087889671325684, + "learning_rate": 4.277236116998728e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.839741587638855, + "num_tokens": 38432889.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "grad_norm": 1.7059247493743896, + "learning_rate": 4.281475201356507e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8412485718727112, + "num_tokens": 38471681.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "grad_norm": 1.7439324855804443, + "learning_rate": 4.285714285714285e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8537160158157349, + "num_tokens": 38505145.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "grad_norm": 1.581948161125183, + "learning_rate": 4.289953370072064e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8550155758857727, + "num_tokens": 38540012.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "grad_norm": 1.596248745918274, + 
"learning_rate": 4.294192454429843e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8248380422592163, + "num_tokens": 38581861.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "grad_norm": 1.503383994102478, + "learning_rate": 4.298431538787622e-07, + "loss": 0.5416, + "mean_token_accuracy": 0.8258275985717773, + "num_tokens": 38630809.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "grad_norm": 1.6459498405456543, + "learning_rate": 4.3026706231454e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8486448526382446, + "num_tokens": 38663639.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "grad_norm": 1.6501758098602295, + "learning_rate": 4.306909707503179e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8502374887466431, + "num_tokens": 38697680.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "grad_norm": 1.4936234951019287, + "learning_rate": 4.311148791860958e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8508256077766418, + "num_tokens": 38736429.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "grad_norm": 1.6362532377243042, + "learning_rate": 4.315387876218737e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8307664394378662, + "num_tokens": 38773634.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "grad_norm": 1.6179279088974, + "learning_rate": 4.319626960576515e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8391589522361755, + "num_tokens": 38811897.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "grad_norm": 1.5881836414337158, + "learning_rate": 4.323866044934294e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8313273787498474, + "num_tokens": 38849567.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "grad_norm": 1.7200701236724854, + "learning_rate": 4.328105129292073e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8319663405418396, + "num_tokens": 38885428.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "grad_norm": 
1.5277791023254395, + "learning_rate": 4.332344213649852e-07, + "loss": 0.4117, + "mean_token_accuracy": 0.8625675439834595, + "num_tokens": 38927537.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "grad_norm": 1.4601069688796997, + "learning_rate": 4.33658329800763e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8478519916534424, + "num_tokens": 38971851.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "grad_norm": 1.6564337015151978, + "learning_rate": 4.340822382365409e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8328343629837036, + "num_tokens": 39007367.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "grad_norm": 1.6516525745391846, + "learning_rate": 4.345061466723188e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8241424560546875, + "num_tokens": 39044944.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "grad_norm": 1.7337706089019775, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8418265581130981, + "num_tokens": 39080635.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "grad_norm": 1.9510098695755005, + "learning_rate": 4.353539635438745e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8234528303146362, + "num_tokens": 39114671.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "grad_norm": 1.6710193157196045, + "learning_rate": 4.357778719796524e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8378314971923828, + "num_tokens": 39149590.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "grad_norm": 1.5420851707458496, + "learning_rate": 4.362017804154303e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.844633162021637, + "num_tokens": 39187414.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "grad_norm": 1.6644208431243896, + "learning_rate": 4.366256888512081e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8475230932235718, + "num_tokens": 39221936.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + 
"grad_norm": 1.5970101356506348, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8580547571182251, + "num_tokens": 39259463.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "grad_norm": 1.7629488706588745, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.8297750949859619, + "num_tokens": 39297566.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "grad_norm": 1.5521413087844849, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8593199849128723, + "num_tokens": 39332480.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "grad_norm": 1.5626535415649414, + "learning_rate": 4.383213225943196e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8451550006866455, + "num_tokens": 39373646.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "grad_norm": 1.4524636268615723, + "learning_rate": 4.3874523103009746e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8489884734153748, + "num_tokens": 39415915.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "grad_norm": 1.6202340126037598, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.595, + "mean_token_accuracy": 0.8127351999282837, + "num_tokens": 39455805.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "grad_norm": 1.6329444646835327, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.477, + "mean_token_accuracy": 0.8401756286621094, + "num_tokens": 39493665.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "grad_norm": 1.551041603088379, + "learning_rate": 4.400169563374311e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8380218148231506, + "num_tokens": 39535957.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "grad_norm": 1.438876986503601, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.8570192456245422, + "num_tokens": 39578683.0, + "step": 1040 + }, + { + "epoch": 
0.13242590001272103, + "grad_norm": 1.5871731042861938, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8485169410705566, + "num_tokens": 39616096.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "grad_norm": 1.6163549423217773, + "learning_rate": 4.4128868164476474e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8284498453140259, + "num_tokens": 39656589.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "grad_norm": 1.7065573930740356, + "learning_rate": 4.417125900805426e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8303922414779663, + "num_tokens": 39694490.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "grad_norm": 1.5522466897964478, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8582793474197388, + "num_tokens": 39733909.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "grad_norm": 1.494000792503357, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.4269, + "mean_token_accuracy": 0.8542311787605286, + "num_tokens": 39772948.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "grad_norm": 1.5862988233566284, + "learning_rate": 4.429843153878762e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8350723385810852, + "num_tokens": 39816973.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "grad_norm": 1.4379624128341675, + "learning_rate": 4.434082238236541e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.8521816730499268, + "num_tokens": 39859615.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "grad_norm": 1.4898029565811157, + "learning_rate": 4.4383213225943193e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8366534113883972, + "num_tokens": 39904131.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "grad_norm": 1.652956485748291, + "learning_rate": 4.442560406952098e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.8370947241783142, + "num_tokens": 39940938.0, + "step": 1049 + 
}, + { + "epoch": 0.13357079252003562, + "grad_norm": 1.7136763334274292, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8377411961555481, + "num_tokens": 39973646.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "grad_norm": 1.6701360940933228, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8360684514045715, + "num_tokens": 40016130.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "grad_norm": 1.5917528867721558, + "learning_rate": 4.455277660025434e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.8517277836799622, + "num_tokens": 40054209.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "grad_norm": 1.4916876554489136, + "learning_rate": 4.459516744383213e-07, + "loss": 0.522, + "mean_token_accuracy": 0.8250965476036072, + "num_tokens": 40096724.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "grad_norm": 1.643872618675232, + "learning_rate": 4.4637558287409916e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8460716009140015, + "num_tokens": 40134397.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "grad_norm": 1.586567759513855, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8367424607276917, + "num_tokens": 40176893.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "grad_norm": 1.6132949590682983, + "learning_rate": 4.472233997456549e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8516770601272583, + "num_tokens": 40213880.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "grad_norm": 1.6025782823562622, + "learning_rate": 4.476473081814328e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8404071927070618, + "num_tokens": 40258039.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "grad_norm": 1.6903561353683472, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8428691625595093, + "num_tokens": 40294277.0, 
+ "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "grad_norm": 1.5253468751907349, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.8314832448959351, + "num_tokens": 40338335.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "grad_norm": 1.5664644241333008, + "learning_rate": 4.489190334887664e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8531217575073242, + "num_tokens": 40378524.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "grad_norm": 1.6067566871643066, + "learning_rate": 4.493429419245443e-07, + "loss": 0.427, + "mean_token_accuracy": 0.8559728264808655, + "num_tokens": 40414872.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "grad_norm": 1.6157488822937012, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.48, + "mean_token_accuracy": 0.846764326095581, + "num_tokens": 40453520.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "grad_norm": 1.4392852783203125, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8495798110961914, + "num_tokens": 40500126.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "grad_norm": 1.4383958578109741, + "learning_rate": 4.506146672318779e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8544899225234985, + "num_tokens": 40545105.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "grad_norm": 1.6011698246002197, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.848630428314209, + "num_tokens": 40579618.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "grad_norm": 1.6106486320495605, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8509762287139893, + "num_tokens": 40618004.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "grad_norm": 1.7330893278121948, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8399522304534912, + 
"num_tokens": 40652814.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "grad_norm": 1.8540034294128418, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4152, + "mean_token_accuracy": 0.8589063882827759, + "num_tokens": 40682719.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "grad_norm": 1.4959360361099243, + "learning_rate": 4.527342094107672e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8444350361824036, + "num_tokens": 40727190.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "grad_norm": 1.7532237768173218, + "learning_rate": 4.531581178465451e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8336838483810425, + "num_tokens": 40761696.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "grad_norm": 1.6930042505264282, + "learning_rate": 4.53582026282323e-07, + "loss": 0.5588, + "mean_token_accuracy": 0.8210076093673706, + "num_tokens": 40796984.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "grad_norm": 1.5982229709625244, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8465096354484558, + "num_tokens": 40833669.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "grad_norm": 1.9443072080612183, + "learning_rate": 4.544298431538787e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.841539740562439, + "num_tokens": 40867648.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "grad_norm": 1.6538335084915161, + "learning_rate": 4.548537515896566e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8525973558425903, + "num_tokens": 40901259.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "grad_norm": 1.5604110956192017, + "learning_rate": 4.552776600254345e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8548691272735596, + "num_tokens": 40940076.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "grad_norm": 1.8087478876113892, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.4606, + "mean_token_accuracy": 
0.8471940755844116, + "num_tokens": 40971585.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "grad_norm": 1.5534605979919434, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8519500494003296, + "num_tokens": 41011450.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "grad_norm": 1.6555414199829102, + "learning_rate": 4.565493853327681e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8394398093223572, + "num_tokens": 41049488.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "grad_norm": 1.5749906301498413, + "learning_rate": 4.56973293768546e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8444579243659973, + "num_tokens": 41085211.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "grad_norm": 1.454005241394043, + "learning_rate": 4.573972022043238e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8452082872390747, + "num_tokens": 41129923.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "grad_norm": 1.6425889730453491, + "learning_rate": 4.578211106401017e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8380271196365356, + "num_tokens": 41165851.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "grad_norm": 1.7471425533294678, + "learning_rate": 4.582450190758796e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.848797082901001, + "num_tokens": 41197131.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "grad_norm": 1.4804303646087646, + "learning_rate": 4.586689275116575e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8487339019775391, + "num_tokens": 41241512.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "grad_norm": 1.5327582359313965, + "learning_rate": 4.590928359474353e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8305269479751587, + "num_tokens": 41287199.0, + "step": 1084 + }, + { + "epoch": 0.13802315227070347, + "grad_norm": 1.564816951751709, + "learning_rate": 4.595167443832132e-07, + "loss": 0.5175, + 
"mean_token_accuracy": 0.8324395418167114, + "num_tokens": 41326548.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "grad_norm": 1.715976357460022, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8362089991569519, + "num_tokens": 41360211.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "grad_norm": 1.6978542804718018, + "learning_rate": 4.60364561254769e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8277554512023926, + "num_tokens": 41405212.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "grad_norm": 1.6344867944717407, + "learning_rate": 4.607884696905468e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8373505473136902, + "num_tokens": 41441818.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "grad_norm": 1.6599806547164917, + "learning_rate": 4.612123781263247e-07, + "loss": 0.4156, + "mean_token_accuracy": 0.858709454536438, + "num_tokens": 41472756.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "grad_norm": 1.5251438617706299, + "learning_rate": 4.616362865621026e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8423016667366028, + "num_tokens": 41517243.0, + "step": 1090 + }, + { + "epoch": 0.13878641394224653, + "grad_norm": 1.5940533876419067, + "learning_rate": 4.620601949978805e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8511151671409607, + "num_tokens": 41556223.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "grad_norm": 1.5470649003982544, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8418904542922974, + "num_tokens": 41596729.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "grad_norm": 1.5582369565963745, + "learning_rate": 4.6290801186943617e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8430610299110413, + "num_tokens": 41633390.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "grad_norm": 1.454423427581787, + "learning_rate": 4.6333192030521407e-07, + 
"loss": 0.4672, + "mean_token_accuracy": 0.8451079726219177, + "num_tokens": 41677197.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "grad_norm": 1.7197604179382324, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8361746072769165, + "num_tokens": 41712101.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "grad_norm": 1.540278434753418, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8494979739189148, + "num_tokens": 41753967.0, + "step": 1096 + }, + { + "epoch": 0.1395496756137896, + "grad_norm": 1.5823529958724976, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.388, + "mean_token_accuracy": 0.8724068403244019, + "num_tokens": 41789502.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "grad_norm": 1.5720515251159668, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8446894288063049, + "num_tokens": 41830631.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "grad_norm": 1.5041428804397583, + "learning_rate": 4.654514624841034e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8452813625335693, + "num_tokens": 41872650.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "grad_norm": 1.7029454708099365, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.8212212324142456, + "num_tokens": 41911326.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "grad_norm": 1.5990829467773438, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.841294527053833, + "num_tokens": 41949775.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "grad_norm": 1.584203839302063, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.8404622673988342, + "num_tokens": 41989179.0, + "step": 1102 + }, + { + "epoch": 0.14031293728533265, + "grad_norm": 1.5463004112243652, + "learning_rate": 
4.671470962272149e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8403990864753723, + "num_tokens": 42033509.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "grad_norm": 1.7010033130645752, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8421645164489746, + "num_tokens": 42067005.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "grad_norm": 1.7223949432373047, + "learning_rate": 4.6799491309877064e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8637940883636475, + "num_tokens": 42102078.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "grad_norm": 1.5382360219955444, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8469454050064087, + "num_tokens": 42142192.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "grad_norm": 1.5893837213516235, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.851886510848999, + "num_tokens": 42181062.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "grad_norm": 1.7014155387878418, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8389965295791626, + "num_tokens": 42219559.0, + "step": 1108 + }, + { + "epoch": 0.1410761989568757, + "grad_norm": 1.6258167028427124, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8454803228378296, + "num_tokens": 42255786.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "grad_norm": 1.6523381471633911, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8488677740097046, + "num_tokens": 42290306.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "grad_norm": 1.5558488368988037, + "learning_rate": 4.7053836371343787e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8506227135658264, + "num_tokens": 42325610.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "grad_norm": 1.3946679830551147, 
+ "learning_rate": 4.709622721492157e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8504267334938049, + "num_tokens": 42369792.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "grad_norm": 1.4986454248428345, + "learning_rate": 4.713861805849936e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8491696715354919, + "num_tokens": 42410480.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "grad_norm": 1.542576551437378, + "learning_rate": 4.718100890207715e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8415554165840149, + "num_tokens": 42447403.0, + "step": 1114 + }, + { + "epoch": 0.1418394606284188, + "grad_norm": 1.702163577079773, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.4237, + "mean_token_accuracy": 0.8580375909805298, + "num_tokens": 42483319.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "grad_norm": 1.5097109079360962, + "learning_rate": 4.726579058923272e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8536049127578735, + "num_tokens": 42524520.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "grad_norm": 1.5250945091247559, + "learning_rate": 4.730818143281051e-07, + "loss": 0.3842, + "mean_token_accuracy": 0.8718160390853882, + "num_tokens": 42560671.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "grad_norm": 1.7583495378494263, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8333663940429688, + "num_tokens": 42594431.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "grad_norm": 1.5049610137939453, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8525390028953552, + "num_tokens": 42634145.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "grad_norm": 1.5108273029327393, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8406903743743896, + "num_tokens": 42675449.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + "grad_norm": 
1.6705673933029175, + "learning_rate": 4.747774480712166e-07, + "loss": 0.42, + "mean_token_accuracy": 0.858503520488739, + "num_tokens": 42707613.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "grad_norm": 1.6775153875350952, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8465753793716431, + "num_tokens": 42741758.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.6341829299926758, + "learning_rate": 4.7562526494277234e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8421056270599365, + "num_tokens": 42778732.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "grad_norm": 1.7280768156051636, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.8321777582168579, + "num_tokens": 42818546.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "grad_norm": 1.6045565605163574, + "learning_rate": 4.764730818143281e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.853611946105957, + "num_tokens": 42855312.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "grad_norm": 1.6195225715637207, + "learning_rate": 4.768969902501059e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8431135416030884, + "num_tokens": 42891016.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "grad_norm": 1.856946349143982, + "learning_rate": 4.773208986858838e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8222510814666748, + "num_tokens": 42923292.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "grad_norm": 1.5718811750411987, + "learning_rate": 4.777448071216617e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8458420038223267, + "num_tokens": 42959523.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "grad_norm": 1.6719697713851929, + "learning_rate": 4.781687155574396e-07, + "loss": 0.52, + "mean_token_accuracy": 0.8301419019699097, + "num_tokens": 42999832.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + 
"grad_norm": 1.6872591972351074, + "learning_rate": 4.785926239932175e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8386020064353943, + "num_tokens": 43036231.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "grad_norm": 1.703130841255188, + "learning_rate": 4.790165324289953e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8385742902755737, + "num_tokens": 43073455.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "grad_norm": 1.6787828207015991, + "learning_rate": 4.794404408647732e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.8600859045982361, + "num_tokens": 43105132.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "grad_norm": 1.682853102684021, + "learning_rate": 4.798643493005511e-07, + "loss": 0.4035, + "mean_token_accuracy": 0.8650631904602051, + "num_tokens": 43137977.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "grad_norm": 1.5614004135131836, + "learning_rate": 4.80288257736329e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8460498452186584, + "num_tokens": 43180102.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "grad_norm": 1.5515146255493164, + "learning_rate": 4.807121661721068e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.848482608795166, + "num_tokens": 43221345.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "grad_norm": 1.8219256401062012, + "learning_rate": 4.811360746078847e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.8508099317550659, + "num_tokens": 43257359.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "grad_norm": 1.6862685680389404, + "learning_rate": 4.815599830436625e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8328640460968018, + "num_tokens": 43295790.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "grad_norm": 1.418816089630127, + "learning_rate": 4.819838914794405e-07, + "loss": 0.403, + "mean_token_accuracy": 0.8647243976593018, + "num_tokens": 43336522.0, + "step": 1138 + }, + { + "epoch": 
0.14489250731459102, + "grad_norm": 1.4541559219360352, + "learning_rate": 4.824077999152183e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8422538042068481, + "num_tokens": 43381943.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "grad_norm": 1.5924129486083984, + "learning_rate": 4.828317083509962e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8386532068252563, + "num_tokens": 43418173.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "grad_norm": 1.7304208278656006, + "learning_rate": 4.83255616786774e-07, + "loss": 0.6065, + "mean_token_accuracy": 0.8054983019828796, + "num_tokens": 43456925.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "grad_norm": 1.4250320196151733, + "learning_rate": 4.83679525222552e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8422673940658569, + "num_tokens": 43502212.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "grad_norm": 1.4740862846374512, + "learning_rate": 4.841034336583298e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8440885543823242, + "num_tokens": 43543815.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "grad_norm": 1.6738882064819336, + "learning_rate": 4.845273420941076e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8324816226959229, + "num_tokens": 43580168.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "grad_norm": 1.4494787454605103, + "learning_rate": 4.849512505298855e-07, + "loss": 0.417, + "mean_token_accuracy": 0.8609315156936646, + "num_tokens": 43620674.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "grad_norm": 1.672745704650879, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8521871566772461, + "num_tokens": 43654847.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "grad_norm": 1.5582953691482544, + "learning_rate": 4.857990674014413e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8548429012298584, + "num_tokens": 43693980.0, + "step": 1147 + }, + { + 
"epoch": 0.1460373998219056, + "grad_norm": 1.7628177404403687, + "learning_rate": 4.862229758372191e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8435103893280029, + "num_tokens": 43725405.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "grad_norm": 1.6591236591339111, + "learning_rate": 4.86646884272997e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8530375957489014, + "num_tokens": 43763069.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "grad_norm": 1.6962460279464722, + "learning_rate": 4.870707927087749e-07, + "loss": 0.4195, + "mean_token_accuracy": 0.8597191572189331, + "num_tokens": 43795320.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "grad_norm": 1.5237675905227661, + "learning_rate": 4.874947011445528e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.8567900657653809, + "num_tokens": 43835306.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "grad_norm": 1.4447951316833496, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8439793586730957, + "num_tokens": 43877798.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "grad_norm": 1.5148372650146484, + "learning_rate": 4.883425180161085e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8493329286575317, + "num_tokens": 43914927.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "grad_norm": 1.6049575805664062, + "learning_rate": 4.887664264518864e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8463026881217957, + "num_tokens": 43953475.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "grad_norm": 1.5526281595230103, + "learning_rate": 4.891903348876643e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.8465713858604431, + "num_tokens": 43991506.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "grad_norm": 1.5027128458023071, + "learning_rate": 4.896142433234421e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8510992527008057, + "num_tokens": 44028821.0, + "step": 1156 
+ }, + { + "epoch": 0.1471822923292202, + "grad_norm": 1.6036856174468994, + "learning_rate": 4.9003815175922e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8490606546401978, + "num_tokens": 44063497.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "grad_norm": 1.566630244255066, + "learning_rate": 4.904620601949979e-07, + "loss": 0.5203, + "mean_token_accuracy": 0.8270604610443115, + "num_tokens": 44104772.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "grad_norm": 1.5229851007461548, + "learning_rate": 4.908859686307758e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8469647169113159, + "num_tokens": 44144138.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "grad_norm": 1.5291565656661987, + "learning_rate": 4.913098770665536e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8469446897506714, + "num_tokens": 44185099.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "grad_norm": 1.571520209312439, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8545222282409668, + "num_tokens": 44223439.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "grad_norm": 1.636317253112793, + "learning_rate": 4.921576939381094e-07, + "loss": 0.4324, + "mean_token_accuracy": 0.8518756628036499, + "num_tokens": 44257661.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "grad_norm": 1.7791409492492676, + "learning_rate": 4.925816023738872e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8330920338630676, + "num_tokens": 44293683.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "grad_norm": 1.7097947597503662, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8555994033813477, + "num_tokens": 44325993.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "grad_norm": 1.6544667482376099, + "learning_rate": 4.934294192454429e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.8524643182754517, + "num_tokens": 44362116.0, + 
"step": 1165 + }, + { + "epoch": 0.1483271848365348, + "grad_norm": 1.630035161972046, + "learning_rate": 4.938533276812209e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8419190049171448, + "num_tokens": 44400271.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "grad_norm": 1.540114164352417, + "learning_rate": 4.942772361169987e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.846704363822937, + "num_tokens": 44444728.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "grad_norm": 1.574611783027649, + "learning_rate": 4.947011445527766e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8419573307037354, + "num_tokens": 44486953.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "grad_norm": 1.5389398336410522, + "learning_rate": 4.951250529885544e-07, + "loss": 0.4131, + "mean_token_accuracy": 0.8607267737388611, + "num_tokens": 44523585.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "grad_norm": 1.5011250972747803, + "learning_rate": 4.955489614243324e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8385971784591675, + "num_tokens": 44565822.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "grad_norm": 1.6443963050842285, + "learning_rate": 4.959728698601102e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.8423082232475281, + "num_tokens": 44603512.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "grad_norm": 1.6149342060089111, + "learning_rate": 4.963967782958881e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8538112044334412, + "num_tokens": 44641908.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "grad_norm": 1.9122592210769653, + "learning_rate": 4.968206867316659e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8311139941215515, + "num_tokens": 44673413.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "grad_norm": 1.5836786031723022, + "learning_rate": 4.972445951674439e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8398367166519165, + "num_tokens": 
44711948.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "grad_norm": 1.7011138200759888, + "learning_rate": 4.976685036032216e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8379660248756409, + "num_tokens": 44748494.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "grad_norm": 1.4757936000823975, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8515806198120117, + "num_tokens": 44790881.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "grad_norm": 1.6779651641845703, + "learning_rate": 4.985163204747774e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8322539329528809, + "num_tokens": 44827538.0, + "step": 1177 + }, + { + "epoch": 0.1498537081796209, + "grad_norm": 1.5699964761734009, + "learning_rate": 4.989402289105554e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8328361511230469, + "num_tokens": 44869528.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "grad_norm": 1.4415535926818848, + "learning_rate": 4.993641373463331e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8605619072914124, + "num_tokens": 44908736.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "grad_norm": 1.4968706369400024, + "learning_rate": 4.997880457821111e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.856303334236145, + "num_tokens": 44947620.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "grad_norm": 1.505009412765503, + "learning_rate": 5.002119542178889e-07, + "loss": 0.454, + "mean_token_accuracy": 0.852583646774292, + "num_tokens": 44989689.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "grad_norm": 1.4968218803405762, + "learning_rate": 5.006358626536667e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8500593900680542, + "num_tokens": 45029147.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "grad_norm": 1.6406139135360718, + "learning_rate": 5.010597710894446e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8502539396286011, + 
"num_tokens": 45062752.0, + "step": 1183 + }, + { + "epoch": 0.15061696985116396, + "grad_norm": 1.5426989793777466, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8488227725028992, + "num_tokens": 45101974.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "grad_norm": 1.58355712890625, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4285, + "mean_token_accuracy": 0.8611252307891846, + "num_tokens": 45138069.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "grad_norm": 1.5113681554794312, + "learning_rate": 5.023314963967783e-07, + "loss": 0.4309, + "mean_token_accuracy": 0.8537026643753052, + "num_tokens": 45176755.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "grad_norm": 1.522365927696228, + "learning_rate": 5.027554048325562e-07, + "loss": 0.4107, + "mean_token_accuracy": 0.8633196353912354, + "num_tokens": 45211527.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "grad_norm": 1.6108193397521973, + "learning_rate": 5.03179313268334e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8424690365791321, + "num_tokens": 45248248.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "grad_norm": 1.644681692123413, + "learning_rate": 5.036032217041119e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8514435887336731, + "num_tokens": 45284732.0, + "step": 1189 + }, + { + "epoch": 0.15138023152270705, + "grad_norm": 1.5298032760620117, + "learning_rate": 5.040271301398897e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8568545579910278, + "num_tokens": 45321720.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "grad_norm": 1.4824910163879395, + "learning_rate": 5.044510385756676e-07, + "loss": 0.4091, + "mean_token_accuracy": 0.8639727234840393, + "num_tokens": 45360330.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "grad_norm": 1.408900260925293, + "learning_rate": 5.048749470114455e-07, + "loss": 0.4135, + "mean_token_accuracy": 
0.8639541864395142, + "num_tokens": 45401809.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "grad_norm": 1.5049539804458618, + "learning_rate": 5.052988554472234e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8357868194580078, + "num_tokens": 45447005.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "grad_norm": 1.500915765762329, + "learning_rate": 5.057227638830013e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8474393486976624, + "num_tokens": 45488807.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "grad_norm": 1.5394841432571411, + "learning_rate": 5.061466723187792e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8426735997200012, + "num_tokens": 45529261.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, + "grad_norm": 1.5497874021530151, + "learning_rate": 5.065705807545569e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8517215847969055, + "num_tokens": 45571054.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "grad_norm": 1.6108930110931396, + "learning_rate": 5.069944891903349e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8429245948791504, + "num_tokens": 45608425.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "grad_norm": 1.643618106842041, + "learning_rate": 5.074183976261127e-07, + "loss": 0.5, + "mean_token_accuracy": 0.837896466255188, + "num_tokens": 45647963.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "grad_norm": 1.6187607049942017, + "learning_rate": 5.078423060618906e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8487112522125244, + "num_tokens": 45684546.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "grad_norm": 1.625872015953064, + "learning_rate": 5.082662144976685e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8415115475654602, + "num_tokens": 45722721.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "grad_norm": 1.5687212944030762, + "learning_rate": 5.086901229334464e-07, + "loss": 0.421, + 
"mean_token_accuracy": 0.8601759672164917, + "num_tokens": 45761693.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "grad_norm": 1.6644434928894043, + "learning_rate": 5.091140313692243e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.837539553642273, + "num_tokens": 45795535.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "grad_norm": 1.4865672588348389, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8364166617393494, + "num_tokens": 45837988.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "grad_norm": 1.5257108211517334, + "learning_rate": 5.099618482407799e-07, + "loss": 0.4045, + "mean_token_accuracy": 0.865264892578125, + "num_tokens": 45877272.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "grad_norm": 1.6883444786071777, + "learning_rate": 5.103857566765578e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8402553200721741, + "num_tokens": 45912200.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "grad_norm": 1.590345025062561, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4825, + "mean_token_accuracy": 0.8422354459762573, + "num_tokens": 45953093.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "grad_norm": 1.632829189300537, + "learning_rate": 5.112335735481135e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8426948189735413, + "num_tokens": 45990389.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "grad_norm": 1.5031671524047852, + "learning_rate": 5.116574819838915e-07, + "loss": 0.4015, + "mean_token_accuracy": 0.8677135109901428, + "num_tokens": 46027196.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "grad_norm": 1.6799014806747437, + "learning_rate": 5.120813904196693e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8421529531478882, + "num_tokens": 46061385.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "grad_norm": 1.7833508253097534, + "learning_rate": 5.125052988554473e-07, + "loss": 
0.4471, + "mean_token_accuracy": 0.8509244918823242, + "num_tokens": 46095234.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "grad_norm": 1.5717709064483643, + "learning_rate": 5.12929207291225e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.828472912311554, + "num_tokens": 46133764.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "grad_norm": 1.8028056621551514, + "learning_rate": 5.133531157270029e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8376684188842773, + "num_tokens": 46163714.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "grad_norm": 1.6374987363815308, + "learning_rate": 5.137770241627808e-07, + "loss": 0.4637, + "mean_token_accuracy": 0.8435662984848022, + "num_tokens": 46198752.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "grad_norm": 1.489253044128418, + "learning_rate": 5.142009325985587e-07, + "loss": 0.4222, + "mean_token_accuracy": 0.8581722974777222, + "num_tokens": 46239585.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "grad_norm": 1.5436115264892578, + "learning_rate": 5.146248410343365e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8549904227256775, + "num_tokens": 46274174.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "grad_norm": 1.5440783500671387, + "learning_rate": 5.150487494701145e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8380799293518066, + "num_tokens": 46315335.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "grad_norm": 1.552417278289795, + "learning_rate": 5.154726579058923e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8515313863754272, + "num_tokens": 46356394.0, + "step": 1217 + }, + { + "epoch": 0.1549421193232413, + "grad_norm": 1.7554328441619873, + "learning_rate": 5.158965663416703e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8406097292900085, + "num_tokens": 46389488.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "grad_norm": 1.5846729278564453, + "learning_rate": 5.16320474777448e-07, + 
"loss": 0.4458, + "mean_token_accuracy": 0.8489116430282593, + "num_tokens": 46431362.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "grad_norm": 1.5666695833206177, + "learning_rate": 5.167443832132259e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8499902486801147, + "num_tokens": 46471450.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "grad_norm": 1.4992103576660156, + "learning_rate": 5.171682916490038e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8435684442520142, + "num_tokens": 46511675.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "grad_norm": 1.4694124460220337, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8588899374008179, + "num_tokens": 46551982.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "grad_norm": 1.566657304763794, + "learning_rate": 5.180161085205595e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8371236324310303, + "num_tokens": 46592738.0, + "step": 1223 + }, + { + "epoch": 0.15570538099478437, + "grad_norm": 1.4049696922302246, + "learning_rate": 5.184400169563374e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8548739552497864, + "num_tokens": 46637240.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "grad_norm": 1.5310605764389038, + "learning_rate": 5.188639253921153e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.8577015399932861, + "num_tokens": 46675699.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "grad_norm": 1.4892271757125854, + "learning_rate": 5.192878338278932e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8354511260986328, + "num_tokens": 46719314.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "grad_norm": 1.6005926132202148, + "learning_rate": 5.19711742263671e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8616669774055481, + "num_tokens": 46754512.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "grad_norm": 1.529964804649353, + "learning_rate": 
5.201356506994488e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8420358300209045, + "num_tokens": 46792197.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "grad_norm": 1.5142720937728882, + "learning_rate": 5.205595591352268e-07, + "loss": 0.4099, + "mean_token_accuracy": 0.8624786138534546, + "num_tokens": 46828318.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + "grad_norm": 1.6186461448669434, + "learning_rate": 5.209834675710046e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.8592201471328735, + "num_tokens": 46860917.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "grad_norm": 1.4766589403152466, + "learning_rate": 5.214073760067825e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.846168577671051, + "num_tokens": 46904364.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "grad_norm": 1.7243067026138306, + "learning_rate": 5.218312844425604e-07, + "loss": 0.3942, + "mean_token_accuracy": 0.8683539628982544, + "num_tokens": 46936586.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "grad_norm": 1.6442253589630127, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4381, + "mean_token_accuracy": 0.8557109832763672, + "num_tokens": 46971713.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "grad_norm": 1.739640474319458, + "learning_rate": 5.226791013141161e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.8545205593109131, + "num_tokens": 47005518.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "grad_norm": 1.5996134281158447, + "learning_rate": 5.23103009749894e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8465478420257568, + "num_tokens": 47045268.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "grad_norm": 1.7659937143325806, + "learning_rate": 5.235269181856718e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8364448547363281, + "num_tokens": 47078928.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "grad_norm": 1.6795791387557983, + 
"learning_rate": 5.239508266214498e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8600435256958008, + "num_tokens": 47114933.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "grad_norm": 1.5396231412887573, + "learning_rate": 5.243747350572276e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8286290168762207, + "num_tokens": 47156926.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "grad_norm": 1.539711594581604, + "learning_rate": 5.247986434930056e-07, + "loss": 0.4157, + "mean_token_accuracy": 0.862561821937561, + "num_tokens": 47195963.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "grad_norm": 1.5893526077270508, + "learning_rate": 5.252225519287834e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.846623957157135, + "num_tokens": 47233000.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "grad_norm": 1.5230224132537842, + "learning_rate": 5.256464603645613e-07, + "loss": 0.407, + "mean_token_accuracy": 0.8611335158348083, + "num_tokens": 47269263.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "grad_norm": 1.6407244205474854, + "learning_rate": 5.260703688003391e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8497375249862671, + "num_tokens": 47310560.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "grad_norm": 1.6461044549942017, + "learning_rate": 5.26494277236117e-07, + "loss": 0.4083, + "mean_token_accuracy": 0.8636578321456909, + "num_tokens": 47347742.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "grad_norm": 1.4791802167892456, + "learning_rate": 5.269181856718948e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8587679266929626, + "num_tokens": 47390645.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "grad_norm": 1.6095613241195679, + "learning_rate": 5.273420941076727e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8458697199821472, + "num_tokens": 47430853.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "grad_norm": 
1.5604159832000732, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8509182929992676, + "num_tokens": 47469432.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "grad_norm": 1.6245635747909546, + "learning_rate": 5.281899109792285e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8365466594696045, + "num_tokens": 47507117.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "grad_norm": 1.5500929355621338, + "learning_rate": 5.286138194150064e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8558433055877686, + "num_tokens": 47547450.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "grad_norm": 1.4547789096832275, + "learning_rate": 5.290377278507841e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8502172231674194, + "num_tokens": 47588288.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "grad_norm": 1.4736437797546387, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.8552864193916321, + "num_tokens": 47632499.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "grad_norm": 1.6318484544754028, + "learning_rate": 5.298855447223399e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8351198434829712, + "num_tokens": 47671232.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "grad_norm": 1.6291234493255615, + "learning_rate": 5.303094531581178e-07, + "loss": 0.5504, + "mean_token_accuracy": 0.8189464807510376, + "num_tokens": 47713808.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "grad_norm": 1.666581630706787, + "learning_rate": 5.307333615938957e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8466255068778992, + "num_tokens": 47747423.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "grad_norm": 1.5855437517166138, + "learning_rate": 5.311572700296736e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.85361647605896, + "num_tokens": 47783741.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + 
"grad_norm": 1.5674110651016235, + "learning_rate": 5.315811784654515e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8531979322433472, + "num_tokens": 47821202.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "grad_norm": 1.606416940689087, + "learning_rate": 5.320050869012294e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8446100354194641, + "num_tokens": 47859744.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "grad_norm": 1.6118453741073608, + "learning_rate": 5.324289953370071e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8398492336273193, + "num_tokens": 47897123.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "grad_norm": 1.5668450593948364, + "learning_rate": 5.328529037727851e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8553528785705566, + "num_tokens": 47935049.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "grad_norm": 1.5344934463500977, + "learning_rate": 5.332768122085629e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.85892653465271, + "num_tokens": 47971304.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "grad_norm": 1.4551101922988892, + "learning_rate": 5.337007206443408e-07, + "loss": 0.4074, + "mean_token_accuracy": 0.8607864379882812, + "num_tokens": 48010714.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "grad_norm": 1.778578758239746, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8493832349777222, + "num_tokens": 48042661.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "grad_norm": 1.8553657531738281, + "learning_rate": 5.345485375158966e-07, + "loss": 0.589, + "mean_token_accuracy": 0.8175475597381592, + "num_tokens": 48073008.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "grad_norm": 1.6067380905151367, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8551291227340698, + "num_tokens": 48106817.0, + "step": 1263 + }, + { + "epoch": 
0.16079379213840478, + "grad_norm": 1.5716344118118286, + "learning_rate": 5.353963543874522e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8631954789161682, + "num_tokens": 48142126.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "grad_norm": 1.5199568271636963, + "learning_rate": 5.358202628232301e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8321779370307922, + "num_tokens": 48185143.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "grad_norm": 1.4444085359573364, + "learning_rate": 5.36244171259008e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.8548725843429565, + "num_tokens": 48226936.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "grad_norm": 1.5903112888336182, + "learning_rate": 5.366680796947859e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8485409021377563, + "num_tokens": 48263902.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "grad_norm": 1.5017515420913696, + "learning_rate": 5.370919881305637e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8489616513252258, + "num_tokens": 48302261.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "grad_norm": 1.5979881286621094, + "learning_rate": 5.375158965663417e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8456190824508667, + "num_tokens": 48342338.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "grad_norm": 1.579611897468567, + "learning_rate": 5.379398050021195e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8449782133102417, + "num_tokens": 48378646.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "grad_norm": 1.461263656616211, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8500381112098694, + "num_tokens": 48421394.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "grad_norm": 1.5030527114868164, + "learning_rate": 5.387876218736752e-07, + "loss": 0.421, + "mean_token_accuracy": 0.8616639375686646, + "num_tokens": 48460163.0, + "step": 1272 + }, + { + 
"epoch": 0.16193868464571937, + "grad_norm": 1.638259768486023, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.8606199026107788, + "num_tokens": 48500107.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "grad_norm": 1.5081955194473267, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8435308337211609, + "num_tokens": 48543172.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "grad_norm": 1.562188982963562, + "learning_rate": 5.400593471810089e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8354941606521606, + "num_tokens": 48585079.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "grad_norm": 1.4804174900054932, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8330089449882507, + "num_tokens": 48627139.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "grad_norm": 1.5458784103393555, + "learning_rate": 5.409071640525647e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8471845388412476, + "num_tokens": 48664895.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "grad_norm": 1.615496039390564, + "learning_rate": 5.413310724883425e-07, + "loss": 0.3766, + "mean_token_accuracy": 0.8714610934257507, + "num_tokens": 48697325.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "grad_norm": 1.4804482460021973, + "learning_rate": 5.417549809241205e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8477376699447632, + "num_tokens": 48737995.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "grad_norm": 1.6222915649414062, + "learning_rate": 5.421788893598982e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8464773297309875, + "num_tokens": 48775130.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "grad_norm": 1.6120595932006836, + "learning_rate": 5.42602797795676e-07, + "loss": 0.5216, + "mean_token_accuracy": 0.8272911906242371, + "num_tokens": 48817621.0, + "step": 1281 + 
}, + { + "epoch": 0.16308357715303395, + "grad_norm": 1.6108301877975464, + "learning_rate": 5.43026706231454e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.8615269064903259, + "num_tokens": 48849331.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "grad_norm": 1.5419960021972656, + "learning_rate": 5.434506146672319e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8480937480926514, + "num_tokens": 48887266.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "grad_norm": 1.5404853820800781, + "learning_rate": 5.438745231030097e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8498075604438782, + "num_tokens": 48924988.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "grad_norm": 1.619613766670227, + "learning_rate": 5.442984315387876e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8366981744766235, + "num_tokens": 48961487.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "grad_norm": 1.6051031351089478, + "learning_rate": 5.447223399745655e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8399880528450012, + "num_tokens": 49000070.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "grad_norm": 1.5633198022842407, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4264, + "mean_token_accuracy": 0.8583557605743408, + "num_tokens": 49036606.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "grad_norm": 1.6813884973526, + "learning_rate": 5.455701568461212e-07, + "loss": 0.4022, + "mean_token_accuracy": 0.8675782680511475, + "num_tokens": 49068638.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "grad_norm": 1.4189739227294922, + "learning_rate": 5.45994065281899e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.8528008460998535, + "num_tokens": 49111049.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "grad_norm": 1.5613125562667847, + "learning_rate": 5.46417973717677e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8377096056938171, + "num_tokens": 49151935.0, + "step": 
1290 + }, + { + "epoch": 0.16422846966034857, + "grad_norm": 1.558480143547058, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8459720015525818, + "num_tokens": 49190273.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "grad_norm": 1.6328396797180176, + "learning_rate": 5.472657905892327e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8280468583106995, + "num_tokens": 49225978.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "grad_norm": 1.5767985582351685, + "learning_rate": 5.476896990250106e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8580994009971619, + "num_tokens": 49260285.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "grad_norm": 1.545145034790039, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8543720245361328, + "num_tokens": 49297388.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "grad_norm": 1.4573633670806885, + "learning_rate": 5.485375158965663e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8534984588623047, + "num_tokens": 49340756.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "grad_norm": 1.5939468145370483, + "learning_rate": 5.489614243323442e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8445737361907959, + "num_tokens": 49377786.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "grad_norm": 1.4740478992462158, + "learning_rate": 5.49385332768122e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8507969975471497, + "num_tokens": 49418141.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "grad_norm": 1.5938111543655396, + "learning_rate": 5.498092412039e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8484835624694824, + "num_tokens": 49460422.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "grad_norm": 1.626518964767456, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8462082147598267, + "num_tokens": 49501430.0, + 
"step": 1299 + }, + { + "epoch": 0.16537336216766316, + "grad_norm": 1.6348036527633667, + "learning_rate": 5.506570580754557e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.846248984336853, + "num_tokens": 49539471.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "grad_norm": 1.4311258792877197, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8565094470977783, + "num_tokens": 49585811.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "grad_norm": 1.5455838441848755, + "learning_rate": 5.515048749470113e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8457459211349487, + "num_tokens": 49627523.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "grad_norm": 1.674538493156433, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4074, + "mean_token_accuracy": 0.8600309491157532, + "num_tokens": 49658620.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "grad_norm": 1.672948956489563, + "learning_rate": 5.523526918185671e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8427379131317139, + "num_tokens": 49696243.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "grad_norm": 1.5087662935256958, + "learning_rate": 5.52776600254345e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8465036153793335, + "num_tokens": 49738585.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "grad_norm": 1.794113278388977, + "learning_rate": 5.532005086901229e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8111116886138916, + "num_tokens": 49774497.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "grad_norm": 1.527098536491394, + "learning_rate": 5.536244171259008e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.8608778715133667, + "num_tokens": 49812144.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "grad_norm": 1.6411902904510498, + "learning_rate": 5.540483255616786e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8420939445495605, + "num_tokens": 
49848695.0, + "step": 1308 + }, + { + "epoch": 0.16651825467497774, + "grad_norm": 1.553529977798462, + "learning_rate": 5.544722339974566e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8430116176605225, + "num_tokens": 49887981.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "grad_norm": 1.6135576963424683, + "learning_rate": 5.548961424332343e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8475697040557861, + "num_tokens": 49925114.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "grad_norm": 1.6294889450073242, + "learning_rate": 5.553200508690123e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8397586345672607, + "num_tokens": 49963745.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "grad_norm": 1.5570085048675537, + "learning_rate": 5.557439593047901e-07, + "loss": 0.4324, + "mean_token_accuracy": 0.8511624336242676, + "num_tokens": 50003681.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "grad_norm": 1.6040345430374146, + "learning_rate": 5.56167867740568e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8441661596298218, + "num_tokens": 50043990.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "grad_norm": 1.5317672491073608, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8444356322288513, + "num_tokens": 50083603.0, + "step": 1314 + }, + { + "epoch": 0.1672815163465208, + "grad_norm": 1.5600608587265015, + "learning_rate": 5.570156846121238e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8500156402587891, + "num_tokens": 50119443.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "grad_norm": 1.5937771797180176, + "learning_rate": 5.574395930479016e-07, + "loss": 0.4301, + "mean_token_accuracy": 0.8567014336585999, + "num_tokens": 50156844.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "grad_norm": 1.5744184255599976, + "learning_rate": 5.578635014836796e-07, + "loss": 0.3939, + "mean_token_accuracy": 0.868826687335968, + 
"num_tokens": 50194947.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "grad_norm": 1.652896523475647, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8349280953407288, + "num_tokens": 50233117.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "grad_norm": 1.6335077285766602, + "learning_rate": 5.587113183552353e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8374722003936768, + "num_tokens": 50272813.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "grad_norm": 1.5735334157943726, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8502005934715271, + "num_tokens": 50309356.0, + "step": 1320 + }, + { + "epoch": 0.16804477801806386, + "grad_norm": 1.5158017873764038, + "learning_rate": 5.59559135226791e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.8574454188346863, + "num_tokens": 50350435.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "grad_norm": 1.6863309144973755, + "learning_rate": 5.599830436625689e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.840351939201355, + "num_tokens": 50386405.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "grad_norm": 1.4897253513336182, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8481674194335938, + "num_tokens": 50432013.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "grad_norm": 1.5375173091888428, + "learning_rate": 5.608308605341246e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8487522602081299, + "num_tokens": 50474687.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "grad_norm": 1.5490409135818481, + "learning_rate": 5.612547689699024e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8577081561088562, + "num_tokens": 50514230.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "grad_norm": 1.5598911046981812, + "learning_rate": 5.616786774056803e-07, + "loss": 0.461, + "mean_token_accuracy": 
0.8460808396339417, + "num_tokens": 50551861.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + "grad_norm": 1.464034080505371, + "learning_rate": 5.621025858414582e-07, + "loss": 0.4375, + "mean_token_accuracy": 0.8535473346710205, + "num_tokens": 50595318.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "grad_norm": 1.4922246932983398, + "learning_rate": 5.625264942772361e-07, + "loss": 0.3908, + "mean_token_accuracy": 0.8727021217346191, + "num_tokens": 50631939.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "grad_norm": 1.65871000289917, + "learning_rate": 5.629504027130139e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.851302981376648, + "num_tokens": 50667466.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "grad_norm": 1.4545273780822754, + "learning_rate": 5.633743111487919e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8541926145553589, + "num_tokens": 50707390.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "grad_norm": 1.5812978744506836, + "learning_rate": 5.637982195845697e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8470350503921509, + "num_tokens": 50747332.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "grad_norm": 1.516466498374939, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.8473681211471558, + "num_tokens": 50791935.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "grad_norm": 1.4774006605148315, + "learning_rate": 5.646460364561254e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8604958653450012, + "num_tokens": 50831972.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "grad_norm": 1.507633924484253, + "learning_rate": 5.650699448919033e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8609111309051514, + "num_tokens": 50871084.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "grad_norm": 1.5401208400726318, + "learning_rate": 5.654938533276812e-07, + "loss": 0.4494, + 
"mean_token_accuracy": 0.8513410091400146, + "num_tokens": 50915108.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "grad_norm": 1.6760058403015137, + "learning_rate": 5.659177617634591e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8413749933242798, + "num_tokens": 50954349.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "grad_norm": 1.4596920013427734, + "learning_rate": 5.663416701992369e-07, + "loss": 0.4018, + "mean_token_accuracy": 0.8644853830337524, + "num_tokens": 50993330.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "grad_norm": 1.549394965171814, + "learning_rate": 5.667655786350149e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8499910831451416, + "num_tokens": 51035424.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "grad_norm": 1.4778711795806885, + "learning_rate": 5.671894870707927e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8394142389297485, + "num_tokens": 51077932.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "grad_norm": 1.4680209159851074, + "learning_rate": 5.676133955065705e-07, + "loss": 0.423, + "mean_token_accuracy": 0.8599289059638977, + "num_tokens": 51118225.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "grad_norm": 1.6085474491119385, + "learning_rate": 5.680373039423484e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.845192551612854, + "num_tokens": 51158297.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "grad_norm": 1.5746294260025024, + "learning_rate": 5.684612123781263e-07, + "loss": 0.5147, + "mean_token_accuracy": 0.8339805603027344, + "num_tokens": 51197327.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "grad_norm": 1.7204806804656982, + "learning_rate": 5.688851208139042e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8404199481010437, + "num_tokens": 51231126.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "grad_norm": 1.6091034412384033, + "learning_rate": 5.69309029249682e-07, + "loss": 
0.5, + "mean_token_accuracy": 0.8344092965126038, + "num_tokens": 51268493.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "grad_norm": 1.4956074953079224, + "learning_rate": 5.697329376854599e-07, + "loss": 0.4175, + "mean_token_accuracy": 0.8596384525299072, + "num_tokens": 51305126.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "grad_norm": 1.3924111127853394, + "learning_rate": 5.701568461212378e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8519570231437683, + "num_tokens": 51349179.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "grad_norm": 1.4343289136886597, + "learning_rate": 5.705807545570157e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8536195755004883, + "num_tokens": 51393040.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "grad_norm": 1.5708287954330444, + "learning_rate": 5.710046629927934e-07, + "loss": 0.3942, + "mean_token_accuracy": 0.866710364818573, + "num_tokens": 51430392.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "grad_norm": 1.6542755365371704, + "learning_rate": 5.714285714285714e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8452663421630859, + "num_tokens": 51466355.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "grad_norm": 1.8384724855422974, + "learning_rate": 5.718524798643492e-07, + "loss": 0.498, + "mean_token_accuracy": 0.8376575112342834, + "num_tokens": 51499559.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "grad_norm": 1.5216914415359497, + "learning_rate": 5.722763883001272e-07, + "loss": 0.4362, + "mean_token_accuracy": 0.8569056987762451, + "num_tokens": 51538934.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "grad_norm": 1.543118953704834, + "learning_rate": 5.72700296735905e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8475372791290283, + "num_tokens": 51581229.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "grad_norm": 1.5756640434265137, + "learning_rate": 5.731242051716829e-07, + 
"loss": 0.465, + "mean_token_accuracy": 0.8466393947601318, + "num_tokens": 51621285.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "grad_norm": 1.5679377317428589, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8461781144142151, + "num_tokens": 51659864.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "grad_norm": 1.5639652013778687, + "learning_rate": 5.739720220432386e-07, + "loss": 0.484, + "mean_token_accuracy": 0.843055009841919, + "num_tokens": 51702288.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "grad_norm": 1.5847889184951782, + "learning_rate": 5.743959304790164e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8485977649688721, + "num_tokens": 51740628.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "grad_norm": 1.5443496704101562, + "learning_rate": 5.748198389147944e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8392539024353027, + "num_tokens": 51777288.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "grad_norm": 1.6149725914001465, + "learning_rate": 5.752437473505722e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8304553031921387, + "num_tokens": 51816989.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "grad_norm": 1.5246796607971191, + "learning_rate": 5.756676557863502e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8410847187042236, + "num_tokens": 51859427.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "grad_norm": 1.7475032806396484, + "learning_rate": 5.76091564222128e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8358702659606934, + "num_tokens": 51894503.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "grad_norm": 1.4923611879348755, + "learning_rate": 5.765154726579059e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.8603798151016235, + "num_tokens": 51934066.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "grad_norm": 1.5397566556930542, + "learning_rate": 
5.769393810936838e-07, + "loss": 0.4146, + "mean_token_accuracy": 0.8593404293060303, + "num_tokens": 51973725.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "grad_norm": 1.592862844467163, + "learning_rate": 5.773632895294616e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.843140721321106, + "num_tokens": 52012456.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "grad_norm": 1.5362613201141357, + "learning_rate": 5.777871979652394e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8369374871253967, + "num_tokens": 52053915.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "grad_norm": 1.749302864074707, + "learning_rate": 5.782111064010173e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8453197479248047, + "num_tokens": 52084755.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "grad_norm": 1.5706791877746582, + "learning_rate": 5.786350148367952e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8301742076873779, + "num_tokens": 52126716.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "grad_norm": 1.5537325143814087, + "learning_rate": 5.790589232725731e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8479197025299072, + "num_tokens": 52165149.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "grad_norm": 1.5162155628204346, + "learning_rate": 5.79482831708351e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8542967438697815, + "num_tokens": 52207009.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "grad_norm": 1.5505831241607666, + "learning_rate": 5.799067401441288e-07, + "loss": 0.473, + "mean_token_accuracy": 0.844719648361206, + "num_tokens": 52250115.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "grad_norm": 1.4923352003097534, + "learning_rate": 5.803306485799068e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.8602458238601685, + "num_tokens": 52292410.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "grad_norm": 1.6569156646728516, + 
"learning_rate": 5.807545570156845e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.8303970694541931, + "num_tokens": 52328473.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "grad_norm": 1.6080113649368286, + "learning_rate": 5.811784654514624e-07, + "loss": 0.41, + "mean_token_accuracy": 0.8640539050102234, + "num_tokens": 52364234.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "grad_norm": 1.5990146398544312, + "learning_rate": 5.816023738872403e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8454670310020447, + "num_tokens": 52402487.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "grad_norm": 1.6443917751312256, + "learning_rate": 5.820262823230182e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.856493353843689, + "num_tokens": 52440262.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "grad_norm": 1.5999406576156616, + "learning_rate": 5.824501907587961e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8596192002296448, + "num_tokens": 52480251.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "grad_norm": 1.6551165580749512, + "learning_rate": 5.82874099194574e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8314123749732971, + "num_tokens": 52516104.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "grad_norm": 1.5636825561523438, + "learning_rate": 5.832980076303518e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8470480442047119, + "num_tokens": 52556071.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "grad_norm": 1.5588446855545044, + "learning_rate": 5.837219160661297e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8457767963409424, + "num_tokens": 52590598.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "grad_norm": 1.5813300609588623, + "learning_rate": 5.841458245019075e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8483370542526245, + "num_tokens": 52627162.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "grad_norm": 
1.4981094598770142, + "learning_rate": 5.845697329376855e-07, + "loss": 0.3987, + "mean_token_accuracy": 0.8676296472549438, + "num_tokens": 52662134.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "grad_norm": 1.460315465927124, + "learning_rate": 5.849936413734633e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8523035645484924, + "num_tokens": 52705871.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "grad_norm": 1.638598084449768, + "learning_rate": 5.854175498092412e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8398483991622925, + "num_tokens": 52740141.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "grad_norm": 1.4717479944229126, + "learning_rate": 5.858414582450191e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8614944219589233, + "num_tokens": 52780789.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "grad_norm": 1.4729094505310059, + "learning_rate": 5.86265366680797e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8476694226264954, + "num_tokens": 52820828.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "grad_norm": 1.7837060689926147, + "learning_rate": 5.866892751165748e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8557611107826233, + "num_tokens": 52851778.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "grad_norm": 1.504289150238037, + "learning_rate": 5.871131835523526e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8339294195175171, + "num_tokens": 52893678.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "grad_norm": 1.516908049583435, + "learning_rate": 5.875370919881305e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.8593565225601196, + "num_tokens": 52933288.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "grad_norm": 1.5841063261032104, + "learning_rate": 5.879610004239084e-07, + "loss": 0.4241, + "mean_token_accuracy": 0.859474778175354, + "num_tokens": 52972865.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + 
"grad_norm": 1.5961834192276, + "learning_rate": 5.883849088596863e-07, + "loss": 0.5524, + "mean_token_accuracy": 0.8261034488677979, + "num_tokens": 53014833.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "grad_norm": 1.4467692375183105, + "learning_rate": 5.888088172954641e-07, + "loss": 0.4685, + "mean_token_accuracy": 0.8469036817550659, + "num_tokens": 53059592.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "grad_norm": 1.5958713293075562, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8445481657981873, + "num_tokens": 53093533.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "grad_norm": 1.5366978645324707, + "learning_rate": 5.896566341670199e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8403372764587402, + "num_tokens": 53137266.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "grad_norm": 1.618545651435852, + "learning_rate": 5.900805426027977e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8486751914024353, + "num_tokens": 53171892.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "grad_norm": 1.6034376621246338, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4241, + "mean_token_accuracy": 0.8588134050369263, + "num_tokens": 53205142.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "grad_norm": 1.4824403524398804, + "learning_rate": 5.909283594743535e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8538655638694763, + "num_tokens": 53245657.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "grad_norm": 1.6075342893600464, + "learning_rate": 5.913522679101314e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.840059757232666, + "num_tokens": 53284927.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "grad_norm": 1.687374234199524, + "learning_rate": 5.917761763459093e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8557697534561157, + "num_tokens": 53318771.0, + "step": 1397 + }, + { + "epoch": 
0.17783996946953315, + "grad_norm": 1.4144082069396973, + "learning_rate": 5.922000847816871e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8348769545555115, + "num_tokens": 53364878.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "grad_norm": 1.5307775735855103, + "learning_rate": 5.926239932174651e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8545680046081543, + "num_tokens": 53401916.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "grad_norm": 1.5623633861541748, + "learning_rate": 5.930479016532429e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8472386598587036, + "num_tokens": 53441110.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "grad_norm": 1.5771397352218628, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8372893929481506, + "num_tokens": 53480226.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "grad_norm": 1.574775218963623, + "learning_rate": 5.938957185247986e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8402153849601746, + "num_tokens": 53522260.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "grad_norm": 1.5744165182113647, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8603086471557617, + "num_tokens": 53558664.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "grad_norm": 1.4981694221496582, + "learning_rate": 5.947435353963544e-07, + "loss": 0.4259, + "mean_token_accuracy": 0.8573994636535645, + "num_tokens": 53597020.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "grad_norm": 1.551745057106018, + "learning_rate": 5.951674438321323e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8481040596961975, + "num_tokens": 53636752.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "grad_norm": 1.4165347814559937, + "learning_rate": 5.955913522679101e-07, + "loss": 0.4213, + "mean_token_accuracy": 0.8593548536300659, + "num_tokens": 53677980.0, + "step": 1406 + }, + { 
+ "epoch": 0.17898486197684774, + "grad_norm": 1.5074093341827393, + "learning_rate": 5.96015260703688e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8538112640380859, + "num_tokens": 53717272.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "grad_norm": 1.4946575164794922, + "learning_rate": 5.964391691394659e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.85771244764328, + "num_tokens": 53761811.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "grad_norm": 1.4802920818328857, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8339971303939819, + "num_tokens": 53802684.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "grad_norm": 1.558812141418457, + "learning_rate": 5.972869860110216e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8397713899612427, + "num_tokens": 53846219.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "grad_norm": 1.5874162912368774, + "learning_rate": 5.977108944467994e-07, + "loss": 0.4168, + "mean_token_accuracy": 0.8571348190307617, + "num_tokens": 53881174.0, + "step": 1411 + }, + { + "epoch": 0.17962091336980027, + "grad_norm": 1.6151212453842163, + "learning_rate": 5.981348028825774e-07, + "loss": 0.4064, + "mean_token_accuracy": 0.8574848771095276, + "num_tokens": 53915665.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "grad_norm": 1.5684128999710083, + "learning_rate": 5.985587113183552e-07, + "loss": 0.4181, + "mean_token_accuracy": 0.859321117401123, + "num_tokens": 53950529.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "grad_norm": 1.5561927556991577, + "learning_rate": 5.989826197541331e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8430798053741455, + "num_tokens": 53990010.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "grad_norm": 1.7334485054016113, + "learning_rate": 5.99406528189911e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8305818438529968, + "num_tokens": 54025506.0, + "step": 1415 
+ }, + { + "epoch": 0.18012975448416232, + "grad_norm": 1.620658040046692, + "learning_rate": 5.998304366256888e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8374650478363037, + "num_tokens": 54063246.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "grad_norm": 1.5922291278839111, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8370026350021362, + "num_tokens": 54102336.0, + "step": 1417 + }, + { + "epoch": 0.18038417504134335, + "grad_norm": 1.5511105060577393, + "learning_rate": 6.006782534972446e-07, + "loss": 0.4427, + "mean_token_accuracy": 0.8514731526374817, + "num_tokens": 54141850.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "grad_norm": 1.5232595205307007, + "learning_rate": 6.011021619330224e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8513650894165039, + "num_tokens": 54178743.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "grad_norm": 1.584142804145813, + "learning_rate": 6.015260703688004e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.8599068522453308, + "num_tokens": 54212993.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "grad_norm": 1.5553936958312988, + "learning_rate": 6.019499788045782e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8438042998313904, + "num_tokens": 54252584.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "grad_norm": 1.660139560699463, + "learning_rate": 6.023738872403561e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8358684778213501, + "num_tokens": 54290212.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "grad_norm": 1.7546290159225464, + "learning_rate": 6.02797795676134e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8303941488265991, + "num_tokens": 54323777.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + "grad_norm": 1.7114273309707642, + "learning_rate": 6.032217041119118e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8434442281723022, + "num_tokens": 54361809.0, + 
"step": 1424 + }, + { + "epoch": 0.1812746469914769, + "grad_norm": 1.5183067321777344, + "learning_rate": 6.036456125476896e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8482610583305359, + "num_tokens": 54400611.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "grad_norm": 1.3912858963012695, + "learning_rate": 6.040695209834675e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.8590530753135681, + "num_tokens": 54444399.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "grad_norm": 1.586721420288086, + "learning_rate": 6.044934294192454e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8411383032798767, + "num_tokens": 54484834.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "grad_norm": 1.710048794746399, + "learning_rate": 6.049173378550233e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.846957802772522, + "num_tokens": 54524539.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "grad_norm": 1.3967869281768799, + "learning_rate": 6.053412462908012e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8460408449172974, + "num_tokens": 54567337.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "grad_norm": 1.6039434671401978, + "learning_rate": 6.05765154726579e-07, + "loss": 0.4685, + "mean_token_accuracy": 0.8476129770278931, + "num_tokens": 54605568.0, + "step": 1430 + }, + { + "epoch": 0.18203790866301997, + "grad_norm": 1.584766149520874, + "learning_rate": 6.061890631623569e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8342195749282837, + "num_tokens": 54644502.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "grad_norm": 1.5964112281799316, + "learning_rate": 6.066129715981347e-07, + "loss": 0.3827, + "mean_token_accuracy": 0.8698489665985107, + "num_tokens": 54682290.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "grad_norm": 1.5009424686431885, + "learning_rate": 6.070368800339126e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8512712121009827, + "num_tokens": 
54725021.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "grad_norm": 1.5908136367797852, + "learning_rate": 6.074607884696905e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.856937825679779, + "num_tokens": 54762524.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "grad_norm": 1.5212762355804443, + "learning_rate": 6.078846969054684e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8519431352615356, + "num_tokens": 54802125.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "grad_norm": 1.4238821268081665, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.847360372543335, + "num_tokens": 54848414.0, + "step": 1436 + }, + { + "epoch": 0.18280117033456303, + "grad_norm": 1.7493003606796265, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.845665693283081, + "num_tokens": 54879231.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "grad_norm": 1.4766812324523926, + "learning_rate": 6.09156422212802e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8552449941635132, + "num_tokens": 54922277.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "grad_norm": 1.6244395971298218, + "learning_rate": 6.095803306485799e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8283654451370239, + "num_tokens": 54960332.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "grad_norm": 1.6291553974151611, + "learning_rate": 6.100042390843577e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.8518565893173218, + "num_tokens": 54995007.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "grad_norm": 1.4569714069366455, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8476132154464722, + "num_tokens": 55038953.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "grad_norm": 1.4727089405059814, + "learning_rate": 6.108520559559135e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8346934914588928, + 
"num_tokens": 55081550.0, + "step": 1442 + }, + { + "epoch": 0.1835644320061061, + "grad_norm": 1.780694603919983, + "learning_rate": 6.112759643916914e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8322901725769043, + "num_tokens": 55117100.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "grad_norm": 1.6260318756103516, + "learning_rate": 6.116998728274693e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8278912305831909, + "num_tokens": 55157105.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "grad_norm": 1.5475293397903442, + "learning_rate": 6.121237812632472e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8498642444610596, + "num_tokens": 55196534.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "grad_norm": 1.5036087036132812, + "learning_rate": 6.125476896990249e-07, + "loss": 0.3846, + "mean_token_accuracy": 0.8704773783683777, + "num_tokens": 55231245.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "grad_norm": 1.642246961593628, + "learning_rate": 6.129715981348028e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8377643823623657, + "num_tokens": 55273199.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "grad_norm": 1.4611718654632568, + "learning_rate": 6.133955065705807e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8445131778717041, + "num_tokens": 55319765.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, + "grad_norm": 1.6774861812591553, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8423606753349304, + "num_tokens": 55356647.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "grad_norm": 1.52211594581604, + "learning_rate": 6.142433234421365e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.861305832862854, + "num_tokens": 55400025.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "grad_norm": 1.4471479654312134, + "learning_rate": 6.146672318779143e-07, + "loss": 0.4488, + "mean_token_accuracy": 
0.8493343591690063, + "num_tokens": 55441558.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "grad_norm": 1.5080336332321167, + "learning_rate": 6.150911403136923e-07, + "loss": 0.4235, + "mean_token_accuracy": 0.8569917678833008, + "num_tokens": 55482691.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "grad_norm": 1.497862458229065, + "learning_rate": 6.155150487494701e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8385569453239441, + "num_tokens": 55525193.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "grad_norm": 1.6007200479507446, + "learning_rate": 6.159389571852479e-07, + "loss": 0.3945, + "mean_token_accuracy": 0.8691604137420654, + "num_tokens": 55561084.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "grad_norm": 1.4371463060379028, + "learning_rate": 6.163628656210258e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.8459645509719849, + "num_tokens": 55604049.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "grad_norm": 1.5533608198165894, + "learning_rate": 6.167867740568037e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8515628576278687, + "num_tokens": 55641557.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "grad_norm": 1.570263385772705, + "learning_rate": 6.172106824925815e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8462199568748474, + "num_tokens": 55679808.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "grad_norm": 1.4122766256332397, + "learning_rate": 6.176345909283595e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8602422475814819, + "num_tokens": 55723219.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "grad_norm": 1.572998046875, + "learning_rate": 6.180584993641373e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8256155252456665, + "num_tokens": 55764644.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "grad_norm": 1.7074562311172485, + "learning_rate": 6.184824077999153e-07, + "loss": 0.42, + 
"mean_token_accuracy": 0.86004239320755, + "num_tokens": 55797789.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "grad_norm": 1.5119718313217163, + "learning_rate": 6.189063162356931e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8563992977142334, + "num_tokens": 55837281.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "grad_norm": 1.5493978261947632, + "learning_rate": 6.193302246714709e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8339213132858276, + "num_tokens": 55878629.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "grad_norm": 1.5045804977416992, + "learning_rate": 6.197541331072488e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8502103686332703, + "num_tokens": 55918664.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "grad_norm": 1.5283564329147339, + "learning_rate": 6.201780415430267e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8562064170837402, + "num_tokens": 55957869.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "grad_norm": 1.405606746673584, + "learning_rate": 6.206019499788045e-07, + "loss": 0.4015, + "mean_token_accuracy": 0.8661692142486572, + "num_tokens": 55997685.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "grad_norm": 1.5921075344085693, + "learning_rate": 6.210258584145825e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8441791534423828, + "num_tokens": 56035985.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "grad_norm": 1.516908049583435, + "learning_rate": 6.214497668503603e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8463565707206726, + "num_tokens": 56074801.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "grad_norm": 1.4937822818756104, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8541122674942017, + "num_tokens": 56113267.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "grad_norm": 1.5496991872787476, + "learning_rate": 6.22297583721916e-07, + "loss": 
0.4358, + "mean_token_accuracy": 0.8548746705055237, + "num_tokens": 56152217.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "grad_norm": 1.513217568397522, + "learning_rate": 6.227214921576938e-07, + "loss": 0.4166, + "mean_token_accuracy": 0.8613948822021484, + "num_tokens": 56192252.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "grad_norm": 1.5585682392120361, + "learning_rate": 6.231454005934718e-07, + "loss": 0.4139, + "mean_token_accuracy": 0.8613654375076294, + "num_tokens": 56228745.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "grad_norm": 1.5486359596252441, + "learning_rate": 6.235693090292496e-07, + "loss": 0.397, + "mean_token_accuracy": 0.8651251792907715, + "num_tokens": 56268465.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "grad_norm": 1.6516169309616089, + "learning_rate": 6.239932174650275e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8356383442878723, + "num_tokens": 56305819.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "grad_norm": 1.7429580688476562, + "learning_rate": 6.244171259008054e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8385206460952759, + "num_tokens": 56338678.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "grad_norm": 1.6793335676193237, + "learning_rate": 6.248410343365833e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.8563815355300903, + "num_tokens": 56375279.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "grad_norm": 1.534112572669983, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8531745672225952, + "num_tokens": 56415600.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "grad_norm": 1.5486445426940918, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8561724424362183, + "num_tokens": 56453195.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "grad_norm": 1.4777578115463257, + "learning_rate": 6.261127596439168e-07, + 
"loss": 0.5251, + "mean_token_accuracy": 0.8274778723716736, + "num_tokens": 56500752.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "grad_norm": 1.458317518234253, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8465375900268555, + "num_tokens": 56546970.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "grad_norm": 1.5718278884887695, + "learning_rate": 6.269605765154726e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8388991951942444, + "num_tokens": 56584996.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "grad_norm": 1.59600830078125, + "learning_rate": 6.273844849512505e-07, + "loss": 0.4359, + "mean_token_accuracy": 0.8552783131599426, + "num_tokens": 56622665.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "grad_norm": 1.658724308013916, + "learning_rate": 6.278083933870284e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8538750410079956, + "num_tokens": 56658201.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "grad_norm": 1.56673002243042, + "learning_rate": 6.282323018228063e-07, + "loss": 0.4269, + "mean_token_accuracy": 0.8566838502883911, + "num_tokens": 56694587.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "grad_norm": 1.6631988286972046, + "learning_rate": 6.286562102585841e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8440837860107422, + "num_tokens": 56727336.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "grad_norm": 1.5081321001052856, + "learning_rate": 6.29080118694362e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8280854225158691, + "num_tokens": 56773772.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "grad_norm": 1.5290863513946533, + "learning_rate": 6.295040271301398e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8492492437362671, + "num_tokens": 56815328.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "grad_norm": 1.5310769081115723, + "learning_rate": 
6.299279355659178e-07, + "loss": 0.3898, + "mean_token_accuracy": 0.8649027943611145, + "num_tokens": 56850819.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "grad_norm": 1.560271143913269, + "learning_rate": 6.303518440016956e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8408320546150208, + "num_tokens": 56893096.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "grad_norm": 1.6077549457550049, + "learning_rate": 6.307757524374735e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8434129357337952, + "num_tokens": 56927550.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "grad_norm": 1.6378517150878906, + "learning_rate": 6.311996608732514e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8385502099990845, + "num_tokens": 56966305.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "grad_norm": 1.6567176580429077, + "learning_rate": 6.316235693090292e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8488665223121643, + "num_tokens": 57003654.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "grad_norm": 1.4903135299682617, + "learning_rate": 6.320474777448071e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.854316771030426, + "num_tokens": 57041592.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "grad_norm": 1.5849350690841675, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4186, + "mean_token_accuracy": 0.8585126399993896, + "num_tokens": 57077053.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "grad_norm": 1.4939305782318115, + "learning_rate": 6.328952946163628e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8538963794708252, + "num_tokens": 57116713.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "grad_norm": 1.5434751510620117, + "learning_rate": 6.333192030521407e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8526501655578613, + "num_tokens": 57158019.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "grad_norm": 1.6276652812957764, + 
"learning_rate": 6.337431114879186e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8508636951446533, + "num_tokens": 57195911.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "grad_norm": 1.713157296180725, + "learning_rate": 6.341670199236965e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.830491840839386, + "num_tokens": 57233255.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "grad_norm": 1.4643880128860474, + "learning_rate": 6.345909283594744e-07, + "loss": 0.3866, + "mean_token_accuracy": 0.8704535961151123, + "num_tokens": 57272890.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "grad_norm": 1.5492076873779297, + "learning_rate": 6.350148367952522e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8480272889137268, + "num_tokens": 57312918.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "grad_norm": 1.4989336729049683, + "learning_rate": 6.354387452310301e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.863233208656311, + "num_tokens": 57350614.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "grad_norm": 1.5260987281799316, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8541334867477417, + "num_tokens": 57385336.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "grad_norm": 1.4068398475646973, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8624814748764038, + "num_tokens": 57424859.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "grad_norm": 1.6484324932098389, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.866826593875885, + "num_tokens": 57458323.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "grad_norm": 1.6787574291229248, + "learning_rate": 6.371343789741416e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8437128067016602, + "num_tokens": 57492610.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "grad_norm": 
1.369808316230774, + "learning_rate": 6.375582874099195e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.8559137582778931, + "num_tokens": 57538444.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "grad_norm": 1.713511347770691, + "learning_rate": 6.379821958456974e-07, + "loss": 0.5257, + "mean_token_accuracy": 0.8307819962501526, + "num_tokens": 57574866.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "grad_norm": 1.5420386791229248, + "learning_rate": 6.384061042814751e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8418444395065308, + "num_tokens": 57616019.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "grad_norm": 1.6201859712600708, + "learning_rate": 6.38830012717253e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8437696695327759, + "num_tokens": 57652901.0, + "step": 1508 + }, + { + "epoch": 0.19196031039307976, + "grad_norm": 1.8293319940567017, + "learning_rate": 6.392539211530309e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8396161198616028, + "num_tokens": 57683234.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "grad_norm": 1.6309040784835815, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.8514913320541382, + "num_tokens": 57718967.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "grad_norm": 1.583375096321106, + "learning_rate": 6.401017380245867e-07, + "loss": 0.4142, + "mean_token_accuracy": 0.8618699312210083, + "num_tokens": 57753415.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "grad_norm": 1.5208075046539307, + "learning_rate": 6.405256464603645e-07, + "loss": 0.419, + "mean_token_accuracy": 0.8607713580131531, + "num_tokens": 57789325.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "grad_norm": 1.616262674331665, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4094, + "mean_token_accuracy": 0.8624426126480103, + "num_tokens": 57822235.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + 
"grad_norm": 1.5085012912750244, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8407542705535889, + "num_tokens": 57863684.0, + "step": 1514 + }, + { + "epoch": 0.19272357206462282, + "grad_norm": 1.7306352853775024, + "learning_rate": 6.417973717676981e-07, + "loss": 0.4106, + "mean_token_accuracy": 0.8610267639160156, + "num_tokens": 57897001.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "grad_norm": 1.6439129114151, + "learning_rate": 6.42221280203476e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8526263236999512, + "num_tokens": 57932597.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "grad_norm": 1.451507568359375, + "learning_rate": 6.426451886392539e-07, + "loss": 0.4023, + "mean_token_accuracy": 0.8647270798683167, + "num_tokens": 57976576.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "grad_norm": 1.6398367881774902, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8383157253265381, + "num_tokens": 58015288.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "grad_norm": 1.525803565979004, + "learning_rate": 6.434930055108097e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8489656448364258, + "num_tokens": 58055536.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "grad_norm": 1.5463038682937622, + "learning_rate": 6.439169139465875e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8438527584075928, + "num_tokens": 58096534.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + "grad_norm": 1.6188910007476807, + "learning_rate": 6.443408223823655e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.84502112865448, + "num_tokens": 58134205.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "grad_norm": 1.4892048835754395, + "learning_rate": 6.447647308181432e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8531007766723633, + "num_tokens": 58173489.0, + "step": 1522 + }, + { + "epoch": 
0.1937412542933469, + "grad_norm": 1.5687178373336792, + "learning_rate": 6.451886392539211e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8588458299636841, + "num_tokens": 58212386.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "grad_norm": 1.5384632349014282, + "learning_rate": 6.45612547689699e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8447690606117249, + "num_tokens": 58254564.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "grad_norm": 1.6793932914733887, + "learning_rate": 6.460364561254769e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8581523299217224, + "num_tokens": 58289217.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "grad_norm": 1.6088249683380127, + "learning_rate": 6.464603645612547e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8533636331558228, + "num_tokens": 58324329.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "grad_norm": 1.6129399538040161, + "learning_rate": 6.468842729970327e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8544535636901855, + "num_tokens": 58360319.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "grad_norm": 1.539291501045227, + "learning_rate": 6.473081814328105e-07, + "loss": 0.4535, + "mean_token_accuracy": 0.8471705317497253, + "num_tokens": 58396850.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "grad_norm": 1.4437403678894043, + "learning_rate": 6.477320898685885e-07, + "loss": 0.416, + "mean_token_accuracy": 0.8605366349220276, + "num_tokens": 58439151.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "grad_norm": 1.5451157093048096, + "learning_rate": 6.481559983043662e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8503454327583313, + "num_tokens": 58479074.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "grad_norm": 1.4226646423339844, + "learning_rate": 6.48579906740144e-07, + "loss": 0.3827, + "mean_token_accuracy": 0.8673115968704224, + "num_tokens": 58520216.0, + "step": 1531 + }, + { + 
"epoch": 0.1948861468006615, + "grad_norm": 1.6773953437805176, + "learning_rate": 6.49003815175922e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8474107980728149, + "num_tokens": 58550934.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "grad_norm": 1.5897953510284424, + "learning_rate": 6.494277236116998e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8386070728302002, + "num_tokens": 58589028.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "grad_norm": 1.4100513458251953, + "learning_rate": 6.498516320474777e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.8619370460510254, + "num_tokens": 58634491.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "grad_norm": 1.483512043952942, + "learning_rate": 6.502755404832556e-07, + "loss": 0.3915, + "mean_token_accuracy": 0.8711715340614319, + "num_tokens": 58671687.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "grad_norm": 1.5378267765045166, + "learning_rate": 6.506994489190335e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8586076498031616, + "num_tokens": 58710509.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "grad_norm": 1.5867390632629395, + "learning_rate": 6.511233573548114e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8512930274009705, + "num_tokens": 58745860.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "grad_norm": 1.4877923727035522, + "learning_rate": 6.515472657905892e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8586901426315308, + "num_tokens": 58785908.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "grad_norm": 1.578425407409668, + "learning_rate": 6.51971174226367e-07, + "loss": 0.4176, + "mean_token_accuracy": 0.8574588298797607, + "num_tokens": 58823906.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "grad_norm": 1.6167117357254028, + "learning_rate": 6.52395082662145e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.8377147912979126, + "num_tokens": 58862998.0, + "step": 1540 + 
}, + { + "epoch": 0.19603103930797608, + "grad_norm": 1.7830430269241333, + "learning_rate": 6.528189910979228e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8455276489257812, + "num_tokens": 58896544.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "grad_norm": 1.5411323308944702, + "learning_rate": 6.532428995337007e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8500802516937256, + "num_tokens": 58934410.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "grad_norm": 1.7853126525878906, + "learning_rate": 6.536668079694786e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8571996092796326, + "num_tokens": 58967330.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "grad_norm": 1.581379771232605, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8615520000457764, + "num_tokens": 59007134.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "grad_norm": 1.6269431114196777, + "learning_rate": 6.545146248410343e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8535298109054565, + "num_tokens": 59044418.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "grad_norm": 1.5518766641616821, + "learning_rate": 6.549385332768122e-07, + "loss": 0.3996, + "mean_token_accuracy": 0.8652617931365967, + "num_tokens": 59081444.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "grad_norm": 1.496422290802002, + "learning_rate": 6.5536244171259e-07, + "loss": 0.4172, + "mean_token_accuracy": 0.8636221885681152, + "num_tokens": 59120955.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "grad_norm": 1.6052719354629517, + "learning_rate": 6.55786350148368e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8375105857849121, + "num_tokens": 59159287.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "grad_norm": 1.5872186422348022, + "learning_rate": 6.562102585841458e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8531122803688049, + "num_tokens": 59197134.0, + "step": 
1549 + }, + { + "epoch": 0.19717593181529067, + "grad_norm": 1.516135811805725, + "learning_rate": 6.566341670199236e-07, + "loss": 0.3995, + "mean_token_accuracy": 0.8631990551948547, + "num_tokens": 59231269.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "grad_norm": 1.6481105089187622, + "learning_rate": 6.570580754557016e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8475560545921326, + "num_tokens": 59268311.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "grad_norm": 1.6293106079101562, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8461471796035767, + "num_tokens": 59303788.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "grad_norm": 1.4758304357528687, + "learning_rate": 6.579058923272573e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8394972085952759, + "num_tokens": 59346425.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "grad_norm": 1.7297780513763428, + "learning_rate": 6.583298007630351e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8508107662200928, + "num_tokens": 59379296.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "grad_norm": 1.513956904411316, + "learning_rate": 6.58753709198813e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.8590049743652344, + "num_tokens": 59416891.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "grad_norm": 1.5389996767044067, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4068, + "mean_token_accuracy": 0.8613566756248474, + "num_tokens": 59452426.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "grad_norm": 1.7063621282577515, + "learning_rate": 6.596015260703688e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8602612018585205, + "num_tokens": 59486563.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "grad_norm": 1.5630972385406494, + "learning_rate": 6.600254345061466e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8564664125442505, + "num_tokens": 59521695.0, + 
"step": 1558 + }, + { + "epoch": 0.19832082432260525, + "grad_norm": 1.5783973932266235, + "learning_rate": 6.604493429419246e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8308421969413757, + "num_tokens": 59563894.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "grad_norm": 1.5457087755203247, + "learning_rate": 6.608732513777023e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.8508031368255615, + "num_tokens": 59601623.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "grad_norm": 1.6132866144180298, + "learning_rate": 6.612971598134803e-07, + "loss": 0.4195, + "mean_token_accuracy": 0.8619017004966736, + "num_tokens": 59636323.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "grad_norm": 1.6331231594085693, + "learning_rate": 6.617210682492581e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8505088090896606, + "num_tokens": 59674449.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "grad_norm": 1.5936260223388672, + "learning_rate": 6.62144976685036e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8464016914367676, + "num_tokens": 59716708.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "grad_norm": 1.589063048362732, + "learning_rate": 6.625688851208139e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8520635366439819, + "num_tokens": 59752578.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "grad_norm": 1.6410281658172607, + "learning_rate": 6.629927935565918e-07, + "loss": 0.4128, + "mean_token_accuracy": 0.8594765067100525, + "num_tokens": 59787916.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "grad_norm": 1.719968318939209, + "learning_rate": 6.634167019923696e-07, + "loss": 0.4224, + "mean_token_accuracy": 0.8576964139938354, + "num_tokens": 59824122.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "grad_norm": 1.4808011054992676, + "learning_rate": 6.638406104281476e-07, + "loss": 0.4265, + "mean_token_accuracy": 0.8591973781585693, + "num_tokens": 
59862782.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "grad_norm": 1.4964876174926758, + "learning_rate": 6.642645188639253e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8534276485443115, + "num_tokens": 59902163.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "grad_norm": 1.7320191860198975, + "learning_rate": 6.646884272997032e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8448149561882019, + "num_tokens": 59937881.0, + "step": 1569 + }, + { + "epoch": 0.19972013738710087, + "grad_norm": 1.6396807432174683, + "learning_rate": 6.651123357354811e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.84549880027771, + "num_tokens": 59975190.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "grad_norm": 1.4734119176864624, + "learning_rate": 6.655362441712589e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8486403822898865, + "num_tokens": 60020177.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "grad_norm": 1.6082669496536255, + "learning_rate": 6.659601526070369e-07, + "loss": 0.428, + "mean_token_accuracy": 0.8553798198699951, + "num_tokens": 60054290.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "grad_norm": 1.6380025148391724, + "learning_rate": 6.663840610428147e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.862796425819397, + "num_tokens": 60087330.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "grad_norm": 1.6112626791000366, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.8635624051094055, + "num_tokens": 60124773.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "grad_norm": 1.5368553400039673, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4165, + "mean_token_accuracy": 0.860831081867218, + "num_tokens": 60162021.0, + "step": 1575 + }, + { + "epoch": 0.20048339905864393, + "grad_norm": 1.5715131759643555, + "learning_rate": 6.676557863501483e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8522090911865234, + 
"num_tokens": 60200660.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "grad_norm": 1.599252462387085, + "learning_rate": 6.680796947859262e-07, + "loss": 0.4272, + "mean_token_accuracy": 0.8572039604187012, + "num_tokens": 60235622.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "grad_norm": 1.500410556793213, + "learning_rate": 6.685036032217041e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8430238962173462, + "num_tokens": 60278063.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "grad_norm": 1.5103561878204346, + "learning_rate": 6.689275116574819e-07, + "loss": 0.3828, + "mean_token_accuracy": 0.8684382438659668, + "num_tokens": 60311374.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "grad_norm": 1.5953508615493774, + "learning_rate": 6.693514200932599e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.830873966217041, + "num_tokens": 60354011.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "grad_norm": 1.573357105255127, + "learning_rate": 6.697753285290377e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8425545692443848, + "num_tokens": 60395280.0, + "step": 1581 + }, + { + "epoch": 0.201246660730187, + "grad_norm": 1.6791924238204956, + "learning_rate": 6.701992369648156e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.8554626703262329, + "num_tokens": 60427641.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "grad_norm": 1.573931097984314, + "learning_rate": 6.706231454005934e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.8553842306137085, + "num_tokens": 60463083.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "grad_norm": 1.553088903427124, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.8502441048622131, + "num_tokens": 60500710.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "grad_norm": 1.5455127954483032, + "learning_rate": 6.714709622721492e-07, + "loss": 0.43, + "mean_token_accuracy": 
0.8579654693603516, + "num_tokens": 60538597.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "grad_norm": 1.592297911643982, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8562771081924438, + "num_tokens": 60571706.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "grad_norm": 1.624958872795105, + "learning_rate": 6.723187791437049e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.8344503045082092, + "num_tokens": 60611648.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, + "grad_norm": 1.5829195976257324, + "learning_rate": 6.727426875794829e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8493173122406006, + "num_tokens": 60647642.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "grad_norm": 1.5664701461791992, + "learning_rate": 6.731665960152607e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8546769618988037, + "num_tokens": 60684790.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "grad_norm": 1.459172010421753, + "learning_rate": 6.735905044510385e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8453449606895447, + "num_tokens": 60725671.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "grad_norm": 1.5807734727859497, + "learning_rate": 6.740144128868164e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8455015420913696, + "num_tokens": 60761537.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "grad_norm": 1.5156981945037842, + "learning_rate": 6.744383213225942e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8405625820159912, + "num_tokens": 60802055.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "grad_norm": 1.569175362586975, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8450378179550171, + "num_tokens": 60844131.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "grad_norm": 1.531563639640808, + "learning_rate": 6.7528613819415e-07, + "loss": 0.5096, + 
"mean_token_accuracy": 0.8322337865829468, + "num_tokens": 60885645.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "grad_norm": 1.5191885232925415, + "learning_rate": 6.757100466299279e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8493573665618896, + "num_tokens": 60925174.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "grad_norm": 1.5076630115509033, + "learning_rate": 6.761339550657058e-07, + "loss": 0.4047, + "mean_token_accuracy": 0.8656209707260132, + "num_tokens": 60963190.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "grad_norm": 1.5601216554641724, + "learning_rate": 6.765578635014837e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8415482044219971, + "num_tokens": 61004892.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "grad_norm": 1.5857033729553223, + "learning_rate": 6.769817719372614e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8480569124221802, + "num_tokens": 61044217.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "grad_norm": 1.6642718315124512, + "learning_rate": 6.774056803730394e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.8491090536117554, + "num_tokens": 61077444.0, + "step": 1599 + }, + { + "epoch": 0.2035364457448162, + "grad_norm": 1.5161464214324951, + "learning_rate": 6.778295888088172e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.84018474817276, + "num_tokens": 61117510.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "grad_norm": 1.487850546836853, + "learning_rate": 6.782534972445952e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.850654125213623, + "num_tokens": 61158532.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "grad_norm": 1.4713923931121826, + "learning_rate": 6.78677405680373e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8632513284683228, + "num_tokens": 61194406.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "grad_norm": 1.6891263723373413, + "learning_rate": 6.791013141161509e-07, + "loss": 
0.4876, + "mean_token_accuracy": 0.8412708044052124, + "num_tokens": 61230323.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "grad_norm": 1.5452827215194702, + "learning_rate": 6.795252225519288e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8441072702407837, + "num_tokens": 61268273.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "grad_norm": 1.5278548002243042, + "learning_rate": 6.799491309877067e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.859666109085083, + "num_tokens": 61304719.0, + "step": 1605 + }, + { + "epoch": 0.20429970741635925, + "grad_norm": 1.6232166290283203, + "learning_rate": 6.803730394234844e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8507916927337646, + "num_tokens": 61343905.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "grad_norm": 1.609946846961975, + "learning_rate": 6.807969478592624e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8451138138771057, + "num_tokens": 61382846.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "grad_norm": 1.5769394636154175, + "learning_rate": 6.812208562950402e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.8455846905708313, + "num_tokens": 61420388.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "grad_norm": 1.4596821069717407, + "learning_rate": 6.816447647308182e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8525742292404175, + "num_tokens": 61462023.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "grad_norm": 1.5739372968673706, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8469229936599731, + "num_tokens": 61500538.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "grad_norm": 1.6060229539871216, + "learning_rate": 6.824925816023738e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8317228555679321, + "num_tokens": 61540304.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + "grad_norm": 1.6592432260513306, + "learning_rate": 
6.829164900381518e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.836219847202301, + "num_tokens": 61576973.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "grad_norm": 1.5082826614379883, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.8618494272232056, + "num_tokens": 61612464.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "grad_norm": 1.567575454711914, + "learning_rate": 6.837643069097074e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8410369157791138, + "num_tokens": 61650815.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "grad_norm": 1.6470675468444824, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8529865741729736, + "num_tokens": 61685844.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "grad_norm": 1.5557739734649658, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4062, + "mean_token_accuracy": 0.8672097325325012, + "num_tokens": 61722198.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "grad_norm": 1.408379316329956, + "learning_rate": 6.850360322170411e-07, + "loss": 0.3821, + "mean_token_accuracy": 0.8705434203147888, + "num_tokens": 61764075.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "grad_norm": 1.7008777856826782, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8596378564834595, + "num_tokens": 61794384.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "grad_norm": 1.3969734907150269, + "learning_rate": 6.858838490885968e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8586304187774658, + "num_tokens": 61837218.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "grad_norm": 1.4540072679519653, + "learning_rate": 6.863077575243748e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8597941398620605, + "num_tokens": 61879564.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "grad_norm": 1.6071288585662842, + 
"learning_rate": 6.867316659601525e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8439561724662781, + "num_tokens": 61918089.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "grad_norm": 1.7768025398254395, + "learning_rate": 6.871555743959304e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8487530946731567, + "num_tokens": 61948428.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "grad_norm": 1.5069559812545776, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8503173589706421, + "num_tokens": 61991282.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "grad_norm": 1.61016047000885, + "learning_rate": 6.880033912674862e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8477647304534912, + "num_tokens": 62028499.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "grad_norm": 1.4268512725830078, + "learning_rate": 6.884272997032641e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8609926700592041, + "num_tokens": 62070495.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "grad_norm": 1.445120096206665, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4195, + "mean_token_accuracy": 0.8627455830574036, + "num_tokens": 62111122.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "grad_norm": 1.6461067199707031, + "learning_rate": 6.892751165748198e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8508788347244263, + "num_tokens": 62143940.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "grad_norm": 1.466040849685669, + "learning_rate": 6.896990250105978e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8615082502365112, + "num_tokens": 62182084.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "grad_norm": 1.5236018896102905, + "learning_rate": 6.901229334463755e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.8606311082839966, + "num_tokens": 62222283.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "grad_norm": 
1.76255464553833, + "learning_rate": 6.905468418821534e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8425326943397522, + "num_tokens": 62254190.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "grad_norm": 1.7482908964157104, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4121, + "mean_token_accuracy": 0.8596667051315308, + "num_tokens": 62284846.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "grad_norm": 1.683447003364563, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8502910137176514, + "num_tokens": 62318505.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "grad_norm": 1.5735875368118286, + "learning_rate": 6.918185671894871e-07, + "loss": 0.4391, + "mean_token_accuracy": 0.8556108474731445, + "num_tokens": 62356074.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "grad_norm": 1.7030552625656128, + "learning_rate": 6.922424756252649e-07, + "loss": 0.4157, + "mean_token_accuracy": 0.8544017672538757, + "num_tokens": 62390272.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "grad_norm": 1.8199145793914795, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8365809917449951, + "num_tokens": 62420769.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "grad_norm": 1.5912516117095947, + "learning_rate": 6.930902924968206e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.862499475479126, + "num_tokens": 62456147.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "grad_norm": 1.5381922721862793, + "learning_rate": 6.935142009325985e-07, + "loss": 0.3646, + "mean_token_accuracy": 0.8744530081748962, + "num_tokens": 62490722.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "grad_norm": 1.7106504440307617, + "learning_rate": 6.939381093683764e-07, + "loss": 0.3797, + "mean_token_accuracy": 0.8691737055778503, + "num_tokens": 62525803.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + 
"grad_norm": 1.5140498876571655, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8440929651260376, + "num_tokens": 62569531.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "grad_norm": 1.4889003038406372, + "learning_rate": 6.947859262399321e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8547184467315674, + "num_tokens": 62612573.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "grad_norm": 1.5103564262390137, + "learning_rate": 6.952098346757101e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.852777361869812, + "num_tokens": 62656060.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "grad_norm": 1.6208546161651611, + "learning_rate": 6.956337431114879e-07, + "loss": 0.3719, + "mean_token_accuracy": 0.8712360858917236, + "num_tokens": 62689074.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "grad_norm": 1.565540075302124, + "learning_rate": 6.960576515472658e-07, + "loss": 0.4321, + "mean_token_accuracy": 0.8531168699264526, + "num_tokens": 62728451.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "grad_norm": 1.6231640577316284, + "learning_rate": 6.964815599830436e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8480716943740845, + "num_tokens": 62767470.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "grad_norm": 1.5791529417037964, + "learning_rate": 6.969054684188215e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8542821407318115, + "num_tokens": 62804459.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "grad_norm": 1.5317643880844116, + "learning_rate": 6.973293768545994e-07, + "loss": 0.4134, + "mean_token_accuracy": 0.8611913323402405, + "num_tokens": 62840042.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "grad_norm": 1.559597373008728, + "learning_rate": 6.977532852903773e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8560259342193604, + "num_tokens": 62877999.0, + "step": 1647 + }, + { + "epoch": 
0.20964253911716066, + "grad_norm": 1.7504351139068604, + "learning_rate": 6.981771937261551e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8345558643341064, + "num_tokens": 62908411.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "grad_norm": 1.6351804733276367, + "learning_rate": 6.986011021619331e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8489354848861694, + "num_tokens": 62945332.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "grad_norm": 1.6330574750900269, + "learning_rate": 6.990250105977109e-07, + "loss": 0.408, + "mean_token_accuracy": 0.8624980449676514, + "num_tokens": 62980118.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "grad_norm": 1.5964429378509521, + "learning_rate": 6.994489190334886e-07, + "loss": 0.4009, + "mean_token_accuracy": 0.8686851263046265, + "num_tokens": 63016268.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "grad_norm": 1.5667694807052612, + "learning_rate": 6.998728274692666e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8472217321395874, + "num_tokens": 63057883.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "grad_norm": 1.6573632955551147, + "learning_rate": 7.002967359050444e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8312129974365234, + "num_tokens": 63097503.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "grad_norm": 1.5508941411972046, + "learning_rate": 7.007206443408224e-07, + "loss": 0.4242, + "mean_token_accuracy": 0.8586091995239258, + "num_tokens": 63135265.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "grad_norm": 1.5229146480560303, + "learning_rate": 7.011445527766002e-07, + "loss": 0.503, + "mean_token_accuracy": 0.834898829460144, + "num_tokens": 63178665.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "grad_norm": 1.4325406551361084, + "learning_rate": 7.015684612123781e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.853417694568634, + "num_tokens": 63224590.0, + "step": 1656 + }, + { + 
"epoch": 0.21078743162447525, + "grad_norm": 1.4246962070465088, + "learning_rate": 7.01992369648156e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8521347045898438, + "num_tokens": 63269024.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "grad_norm": 1.5281460285186768, + "learning_rate": 7.024162780839339e-07, + "loss": 0.4138, + "mean_token_accuracy": 0.8630419969558716, + "num_tokens": 63306880.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "grad_norm": 1.7761156558990479, + "learning_rate": 7.028401865197116e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8315088152885437, + "num_tokens": 63338877.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "grad_norm": 1.5523806810379028, + "learning_rate": 7.032640949554896e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8516396284103394, + "num_tokens": 63378434.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "grad_norm": 1.5848727226257324, + "learning_rate": 7.036880033912674e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8366951942443848, + "num_tokens": 63418743.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "grad_norm": 1.4675155878067017, + "learning_rate": 7.041119118270454e-07, + "loss": 0.4074, + "mean_token_accuracy": 0.8621540665626526, + "num_tokens": 63457842.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "grad_norm": 1.515190839767456, + "learning_rate": 7.045358202628232e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8530519008636475, + "num_tokens": 63499617.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "grad_norm": 1.5120302438735962, + "learning_rate": 7.049597286986011e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8544410467147827, + "num_tokens": 63538930.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "grad_norm": 1.5152093172073364, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4006, + "mean_token_accuracy": 0.8630194664001465, + "num_tokens": 63576405.0, + "step": 1665 
+ }, + { + "epoch": 0.21193232413178986, + "grad_norm": 1.760722041130066, + "learning_rate": 7.058075455701568e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8384740352630615, + "num_tokens": 63609563.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "grad_norm": 1.5366485118865967, + "learning_rate": 7.062314540059346e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8501912355422974, + "num_tokens": 63648982.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "grad_norm": 1.658273458480835, + "learning_rate": 7.066553624417126e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8341151475906372, + "num_tokens": 63687237.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "grad_norm": 1.5912890434265137, + "learning_rate": 7.070792708774904e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.8518017530441284, + "num_tokens": 63721259.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "grad_norm": 1.4557222127914429, + "learning_rate": 7.075031793132684e-07, + "loss": 0.4354, + "mean_token_accuracy": 0.8529659509658813, + "num_tokens": 63762170.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "grad_norm": 1.478137731552124, + "learning_rate": 7.079270877490462e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8622918128967285, + "num_tokens": 63802440.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "grad_norm": 1.5063624382019043, + "learning_rate": 7.08350996184824e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.844402551651001, + "num_tokens": 63842271.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "grad_norm": 1.5797204971313477, + "learning_rate": 7.08774904620602e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8535555005073547, + "num_tokens": 63877905.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "grad_norm": 1.7054795026779175, + "learning_rate": 7.091988130563797e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8395140767097473, + "num_tokens": 63911840.0, + 
"step": 1674 + }, + { + "epoch": 0.21307721663910445, + "grad_norm": 1.5854684114456177, + "learning_rate": 7.096227214921576e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8359910249710083, + "num_tokens": 63949751.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "grad_norm": 1.5313690900802612, + "learning_rate": 7.100466299279355e-07, + "loss": 0.3739, + "mean_token_accuracy": 0.8741925358772278, + "num_tokens": 63986073.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "grad_norm": 1.4454799890518188, + "learning_rate": 7.104705383637134e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8490253686904907, + "num_tokens": 64027920.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "grad_norm": 1.4547617435455322, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8555008172988892, + "num_tokens": 64069861.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "grad_norm": 1.7314069271087646, + "learning_rate": 7.113183552352692e-07, + "loss": 0.3848, + "mean_token_accuracy": 0.8698270320892334, + "num_tokens": 64100042.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "grad_norm": 1.5767605304718018, + "learning_rate": 7.11742263671047e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8507252931594849, + "num_tokens": 64138619.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "grad_norm": 1.4874287843704224, + "learning_rate": 7.12166172106825e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8531683683395386, + "num_tokens": 64182342.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "grad_norm": 1.6174534559249878, + "learning_rate": 7.125900805426027e-07, + "loss": 0.4445, + "mean_token_accuracy": 0.8487834930419922, + "num_tokens": 64217975.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "grad_norm": 1.4926937818527222, + "learning_rate": 7.130139889783806e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.8497343063354492, + "num_tokens": 
64261238.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "grad_norm": 1.4568628072738647, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8547698855400085, + "num_tokens": 64301429.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "grad_norm": 1.521986484527588, + "learning_rate": 7.138618058499364e-07, + "loss": 0.4094, + "mean_token_accuracy": 0.8640862703323364, + "num_tokens": 64339795.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "grad_norm": 1.4724074602127075, + "learning_rate": 7.142857142857143e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8523169755935669, + "num_tokens": 64382104.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "grad_norm": 1.4174306392669678, + "learning_rate": 7.147096227214922e-07, + "loss": 0.4301, + "mean_token_accuracy": 0.8552597761154175, + "num_tokens": 64423890.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "grad_norm": 1.5110810995101929, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4015, + "mean_token_accuracy": 0.8642181754112244, + "num_tokens": 64459366.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "grad_norm": 1.563225507736206, + "learning_rate": 7.155574395930479e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8492962121963501, + "num_tokens": 64497524.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "grad_norm": 1.5127184391021729, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8558065891265869, + "num_tokens": 64534883.0, + "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "grad_norm": 1.6683547496795654, + "learning_rate": 7.164052564646035e-07, + "loss": 0.439, + "mean_token_accuracy": 0.8478946089744568, + "num_tokens": 64566266.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "grad_norm": 1.7017723321914673, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8497837781906128, + 
"num_tokens": 64600493.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "grad_norm": 1.4869742393493652, + "learning_rate": 7.172530733361593e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8643839359283447, + "num_tokens": 64639951.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "grad_norm": 1.4018336534500122, + "learning_rate": 7.176769817719373e-07, + "loss": 0.4135, + "mean_token_accuracy": 0.8594027757644653, + "num_tokens": 64682160.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "grad_norm": 1.5992650985717773, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4157, + "mean_token_accuracy": 0.8598721623420715, + "num_tokens": 64718162.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "grad_norm": 1.550031065940857, + "learning_rate": 7.18524798643493e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8502528667449951, + "num_tokens": 64756421.0, + "step": 1696 + }, + { + "epoch": 0.21587584276809565, + "grad_norm": 1.7110658884048462, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8467484712600708, + "num_tokens": 64792142.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "grad_norm": 1.4887908697128296, + "learning_rate": 7.193726155150487e-07, + "loss": 0.3906, + "mean_token_accuracy": 0.867576539516449, + "num_tokens": 64832629.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "grad_norm": 1.5530338287353516, + "learning_rate": 7.197965239508265e-07, + "loss": 0.3864, + "mean_token_accuracy": 0.869652509689331, + "num_tokens": 64869744.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "grad_norm": 1.5608932971954346, + "learning_rate": 7.202204323866045e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8504165410995483, + "num_tokens": 64909872.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "grad_norm": 1.603812336921692, + "learning_rate": 7.206443408223823e-07, + "loss": 0.411, + "mean_token_accuracy": 
0.8613393306732178, + "num_tokens": 64946436.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "grad_norm": 1.5423132181167603, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.8543021082878113, + "num_tokens": 64987834.0, + "step": 1702 + }, + { + "epoch": 0.2166391044396387, + "grad_norm": 1.546086072921753, + "learning_rate": 7.214921576939381e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8554288148880005, + "num_tokens": 65026880.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "grad_norm": 1.6609042882919312, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8360833525657654, + "num_tokens": 65063300.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "grad_norm": 1.667995810508728, + "learning_rate": 7.223399745654938e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8363164663314819, + "num_tokens": 65102784.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "grad_norm": 1.5360195636749268, + "learning_rate": 7.227638830012717e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8563094139099121, + "num_tokens": 65140764.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "grad_norm": 1.7113893032073975, + "learning_rate": 7.231877914370495e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.8518450856208801, + "num_tokens": 65173764.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "grad_norm": 1.7345671653747559, + "learning_rate": 7.236116998728275e-07, + "loss": 0.5392, + "mean_token_accuracy": 0.8214230537414551, + "num_tokens": 65211744.0, + "step": 1708 + }, + { + "epoch": 0.21740236611118177, + "grad_norm": 1.3812121152877808, + "learning_rate": 7.240356083086053e-07, + "loss": 0.4186, + "mean_token_accuracy": 0.8556950092315674, + "num_tokens": 65254857.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "grad_norm": 1.763787031173706, + "learning_rate": 7.244595167443833e-07, + "loss": 0.5345, + 
"mean_token_accuracy": 0.828999400138855, + "num_tokens": 65287918.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "grad_norm": 1.7654751539230347, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8539925217628479, + "num_tokens": 65320884.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "grad_norm": 1.4255131483078003, + "learning_rate": 7.253073336159388e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8573483228683472, + "num_tokens": 65362640.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "grad_norm": 1.5943458080291748, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8553571701049805, + "num_tokens": 65397302.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "grad_norm": 1.515272855758667, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8510380387306213, + "num_tokens": 65438583.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + "grad_norm": 1.4288884401321411, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8505882024765015, + "num_tokens": 65480546.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "grad_norm": 1.6602956056594849, + "learning_rate": 7.270029673590504e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8489681482315063, + "num_tokens": 65514862.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "grad_norm": 1.5239450931549072, + "learning_rate": 7.274268757948283e-07, + "loss": 0.4419, + "mean_token_accuracy": 0.8545336723327637, + "num_tokens": 65556017.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "grad_norm": 1.5992457866668701, + "learning_rate": 7.278507842306062e-07, + "loss": 0.405, + "mean_token_accuracy": 0.8634523153305054, + "num_tokens": 65591591.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "grad_norm": 1.5660544633865356, + "learning_rate": 7.282746926663841e-07, + "loss": 
0.4846, + "mean_token_accuracy": 0.839827835559845, + "num_tokens": 65634362.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "grad_norm": 1.7554874420166016, + "learning_rate": 7.286986011021618e-07, + "loss": 0.511, + "mean_token_accuracy": 0.835781455039978, + "num_tokens": 65668843.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "grad_norm": 1.6113479137420654, + "learning_rate": 7.291225095379398e-07, + "loss": 0.4159, + "mean_token_accuracy": 0.8577380180358887, + "num_tokens": 65708082.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "grad_norm": 1.528847575187683, + "learning_rate": 7.295464179737176e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8513222932815552, + "num_tokens": 65748477.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "grad_norm": 1.4611979722976685, + "learning_rate": 7.299703264094955e-07, + "loss": 0.3902, + "mean_token_accuracy": 0.8696866035461426, + "num_tokens": 65789665.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "grad_norm": 1.5576294660568237, + "learning_rate": 7.303942348452734e-07, + "loss": 0.4084, + "mean_token_accuracy": 0.8618805408477783, + "num_tokens": 65824062.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "grad_norm": 1.7735581398010254, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8558220863342285, + "num_tokens": 65858880.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "grad_norm": 1.6679580211639404, + "learning_rate": 7.312420517168292e-07, + "loss": 0.3986, + "mean_token_accuracy": 0.8656876087188721, + "num_tokens": 65893836.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "grad_norm": 1.62832772731781, + "learning_rate": 7.31665960152607e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8500525951385498, + "num_tokens": 65930040.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "grad_norm": 1.4740958213806152, + "learning_rate": 7.320898685883848e-07, + 
"loss": 0.5209, + "mean_token_accuracy": 0.8267167806625366, + "num_tokens": 65972652.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "grad_norm": 1.7205142974853516, + "learning_rate": 7.325137770241628e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8421715497970581, + "num_tokens": 66007606.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "grad_norm": 1.5102556943893433, + "learning_rate": 7.329376854599406e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8418537974357605, + "num_tokens": 66052501.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "grad_norm": 1.6986851692199707, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8516228795051575, + "num_tokens": 66086378.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "grad_norm": 1.5485379695892334, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.8596378564834595, + "num_tokens": 66127081.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "grad_norm": 1.5541027784347534, + "learning_rate": 7.342094107672742e-07, + "loss": 0.4136, + "mean_token_accuracy": 0.8587539792060852, + "num_tokens": 66165351.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "grad_norm": 1.5416697263717651, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8508793711662292, + "num_tokens": 66206427.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "grad_norm": 1.540522575378418, + "learning_rate": 7.350572276388299e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.852328896522522, + "num_tokens": 66245308.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "grad_norm": 1.5238457918167114, + "learning_rate": 7.354811360746078e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8482784032821655, + "num_tokens": 66284585.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "grad_norm": 1.565850853919983, + "learning_rate": 
7.359050445103857e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.84190833568573, + "num_tokens": 66326688.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "grad_norm": 1.4798698425292969, + "learning_rate": 7.363289529461636e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8421645164489746, + "num_tokens": 66369618.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "grad_norm": 1.5571345090866089, + "learning_rate": 7.367528613819415e-07, + "loss": 0.417, + "mean_token_accuracy": 0.8604458570480347, + "num_tokens": 66402601.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "grad_norm": 1.4608248472213745, + "learning_rate": 7.371767698177194e-07, + "loss": 0.3889, + "mean_token_accuracy": 0.8680803775787354, + "num_tokens": 66442510.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "grad_norm": 1.4257158041000366, + "learning_rate": 7.376006782534972e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.84932541847229, + "num_tokens": 66486807.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "grad_norm": 1.4458374977111816, + "learning_rate": 7.380245866892751e-07, + "loss": 0.4086, + "mean_token_accuracy": 0.8631366491317749, + "num_tokens": 66528067.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "grad_norm": 1.5166436433792114, + "learning_rate": 7.384484951250529e-07, + "loss": 0.4025, + "mean_token_accuracy": 0.8674638271331787, + "num_tokens": 66564397.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "grad_norm": 1.5901614427566528, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8592585325241089, + "num_tokens": 66600872.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "grad_norm": 1.4521886110305786, + "learning_rate": 7.392963119966087e-07, + "loss": 0.43, + "mean_token_accuracy": 0.854019284248352, + "num_tokens": 66643698.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "grad_norm": 1.6821131706237793, + 
"learning_rate": 7.397202204323866e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8556603193283081, + "num_tokens": 66677814.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "grad_norm": 1.5072293281555176, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.8590794801712036, + "num_tokens": 66717466.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "grad_norm": 1.5130842924118042, + "learning_rate": 7.405680373039424e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8549609780311584, + "num_tokens": 66755884.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "grad_norm": 1.577709674835205, + "learning_rate": 7.409919457397202e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8386162519454956, + "num_tokens": 66797752.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "grad_norm": 1.6043668985366821, + "learning_rate": 7.414158541754981e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8506073355674744, + "num_tokens": 66831546.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "grad_norm": 1.5827261209487915, + "learning_rate": 7.418397626112759e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8454678654670715, + "num_tokens": 66871713.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "grad_norm": 1.5557847023010254, + "learning_rate": 7.422636710470537e-07, + "loss": 0.412, + "mean_token_accuracy": 0.8626123070716858, + "num_tokens": 66912247.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "grad_norm": 1.551505208015442, + "learning_rate": 7.426875794828317e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8498809337615967, + "num_tokens": 66956780.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "grad_norm": 1.4734724760055542, + "learning_rate": 7.431114879186095e-07, + "loss": 0.3896, + "mean_token_accuracy": 0.8700709342956543, + "num_tokens": 66996904.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "grad_norm": 
1.5915775299072266, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4255, + "mean_token_accuracy": 0.8580414652824402, + "num_tokens": 67034356.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "grad_norm": 1.6220731735229492, + "learning_rate": 7.439593047901653e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.8547822833061218, + "num_tokens": 67068182.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "grad_norm": 1.6866202354431152, + "learning_rate": 7.443832132259431e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.836930513381958, + "num_tokens": 67104884.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "grad_norm": 1.5070246458053589, + "learning_rate": 7.44807121661721e-07, + "loss": 0.4268, + "mean_token_accuracy": 0.8571161031723022, + "num_tokens": 67145547.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "grad_norm": 1.4955860376358032, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8474727869033813, + "num_tokens": 67188423.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "grad_norm": 1.6648364067077637, + "learning_rate": 7.456549385332767e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8501337170600891, + "num_tokens": 67220534.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "grad_norm": 1.5580971240997314, + "learning_rate": 7.460788469690547e-07, + "loss": 0.3918, + "mean_token_accuracy": 0.8695850372314453, + "num_tokens": 67256012.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "grad_norm": 1.574893593788147, + "learning_rate": 7.465027554048325e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8429375290870667, + "num_tokens": 67293671.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "grad_norm": 1.404037594795227, + "learning_rate": 7.469266638406105e-07, + "loss": 0.4246, + "mean_token_accuracy": 0.8566288352012634, + "num_tokens": 67340514.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + 
"grad_norm": 1.6428680419921875, + "learning_rate": 7.473505722763883e-07, + "loss": 0.3981, + "mean_token_accuracy": 0.859990119934082, + "num_tokens": 67370882.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "grad_norm": 1.4917441606521606, + "learning_rate": 7.477744807121661e-07, + "loss": 0.428, + "mean_token_accuracy": 0.8534855842590332, + "num_tokens": 67408968.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "grad_norm": 1.4929304122924805, + "learning_rate": 7.48198389147944e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8399325609207153, + "num_tokens": 67449528.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "grad_norm": 1.462470531463623, + "learning_rate": 7.486222975837219e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8501946926116943, + "num_tokens": 67491306.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "grad_norm": 1.8014394044876099, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8434485197067261, + "num_tokens": 67521783.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "grad_norm": 1.5765843391418457, + "learning_rate": 7.494701144552777e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8552910089492798, + "num_tokens": 67558501.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "grad_norm": 1.505649447441101, + "learning_rate": 7.498940228910555e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8369439244270325, + "num_tokens": 67600562.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "grad_norm": 1.4403865337371826, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4022, + "mean_token_accuracy": 0.86515212059021, + "num_tokens": 67644170.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "grad_norm": 1.5754845142364502, + "learning_rate": 7.507418397626113e-07, + "loss": 0.4202, + "mean_token_accuracy": 0.8598764538764954, + "num_tokens": 67680227.0, + "step": 1772 + }, + { + "epoch": 
0.22554382394097444, + "grad_norm": 1.5369619131088257, + "learning_rate": 7.51165748198389e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.831362247467041, + "num_tokens": 67721471.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "grad_norm": 1.5781676769256592, + "learning_rate": 7.51589656634167e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8542802333831787, + "num_tokens": 67759239.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "grad_norm": 1.6223011016845703, + "learning_rate": 7.520135650699448e-07, + "loss": 0.4126, + "mean_token_accuracy": 0.8624079823493958, + "num_tokens": 67794554.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "grad_norm": 1.5500247478485107, + "learning_rate": 7.524374735057227e-07, + "loss": 0.4304, + "mean_token_accuracy": 0.8632278442382812, + "num_tokens": 67831158.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "grad_norm": 1.6522572040557861, + "learning_rate": 7.528613819415006e-07, + "loss": 0.5, + "mean_token_accuracy": 0.8327044248580933, + "num_tokens": 67870262.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "grad_norm": 1.5848925113677979, + "learning_rate": 7.532852903772785e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8470033407211304, + "num_tokens": 67912055.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "grad_norm": 1.4715874195098877, + "learning_rate": 7.537091988130564e-07, + "loss": 0.4101, + "mean_token_accuracy": 0.8611351251602173, + "num_tokens": 67948801.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "grad_norm": 1.5407570600509644, + "learning_rate": 7.541331072488342e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8553741574287415, + "num_tokens": 67987530.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "grad_norm": 1.5904818773269653, + "learning_rate": 7.54557015684612e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8499107360839844, + "num_tokens": 68027335.0, + "step": 1781 + }, + { + 
"epoch": 0.22668871644828903, + "grad_norm": 1.7126330137252808, + "learning_rate": 7.5498092412039e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8432131409645081, + "num_tokens": 68060595.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "grad_norm": 1.5922935009002686, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4202, + "mean_token_accuracy": 0.8575104475021362, + "num_tokens": 68094520.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "grad_norm": 1.563988447189331, + "learning_rate": 7.558287409919457e-07, + "loss": 0.4095, + "mean_token_accuracy": 0.8612227439880371, + "num_tokens": 68129755.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "grad_norm": 1.6037006378173828, + "learning_rate": 7.562526494277236e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8518613576889038, + "num_tokens": 68166045.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "grad_norm": 1.5568821430206299, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4142, + "mean_token_accuracy": 0.8616669178009033, + "num_tokens": 68205172.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "grad_norm": 1.5458862781524658, + "learning_rate": 7.571004662992794e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8408451080322266, + "num_tokens": 68246147.0, + "step": 1787 + }, + { + "epoch": 0.2274519781198321, + "grad_norm": 1.5276137590408325, + "learning_rate": 7.575243747350572e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8553746938705444, + "num_tokens": 68284744.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "grad_norm": 1.542992353439331, + "learning_rate": 7.57948283170835e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8421084880828857, + "num_tokens": 68325269.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "grad_norm": 1.5221258401870728, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4136, + "mean_token_accuracy": 0.8599923849105835, + "num_tokens": 68365790.0, + "step": 1790 + 
}, + { + "epoch": 0.22783360895560362, + "grad_norm": 1.6144307851791382, + "learning_rate": 7.587961000423908e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8459774851799011, + "num_tokens": 68404979.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "grad_norm": 1.5518661737442017, + "learning_rate": 7.592200084781686e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8469854593276978, + "num_tokens": 68445892.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "grad_norm": 1.473571538925171, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8533633947372437, + "num_tokens": 68487817.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + "grad_norm": 1.5876238346099854, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8559613823890686, + "num_tokens": 68525642.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "grad_norm": 1.6435920000076294, + "learning_rate": 7.604917337855023e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8285691738128662, + "num_tokens": 68562914.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "grad_norm": 1.5692323446273804, + "learning_rate": 7.609156422212801e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8356530666351318, + "num_tokens": 68603649.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "grad_norm": 1.7014141082763672, + "learning_rate": 7.61339550657058e-07, + "loss": 0.4045, + "mean_token_accuracy": 0.8649402856826782, + "num_tokens": 68635941.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "grad_norm": 1.676896095275879, + "learning_rate": 7.617634590928359e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8542836904525757, + "num_tokens": 68668798.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "grad_norm": 1.5786718130111694, + "learning_rate": 7.621873675286138e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8488378524780273, + "num_tokens": 68705559.0, + 
"step": 1799 + }, + { + "epoch": 0.2289785014629182, + "grad_norm": 1.443454384803772, + "learning_rate": 7.626112759643916e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8412899971008301, + "num_tokens": 68751555.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "grad_norm": 1.466945767402649, + "learning_rate": 7.630351844001696e-07, + "loss": 0.3807, + "mean_token_accuracy": 0.8694033622741699, + "num_tokens": 68791184.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "grad_norm": 1.474729299545288, + "learning_rate": 7.634590928359474e-07, + "loss": 0.3975, + "mean_token_accuracy": 0.8665494918823242, + "num_tokens": 68832126.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "grad_norm": 1.6105595827102661, + "learning_rate": 7.638830012717253e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.842413604259491, + "num_tokens": 68872981.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "grad_norm": 1.6172218322753906, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8422605991363525, + "num_tokens": 68911300.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "grad_norm": 1.6046299934387207, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8435044884681702, + "num_tokens": 68947515.0, + "step": 1805 + }, + { + "epoch": 0.22974176313446126, + "grad_norm": 1.4703384637832642, + "learning_rate": 7.651547265790589e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8374252319335938, + "num_tokens": 68992227.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "grad_norm": 1.5508687496185303, + "learning_rate": 7.655786350148368e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8458232879638672, + "num_tokens": 69031529.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "grad_norm": 1.6065220832824707, + "learning_rate": 7.660025434506146e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8401684761047363, + "num_tokens": 
69068685.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "grad_norm": 1.5028347969055176, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4213, + "mean_token_accuracy": 0.8588160276412964, + "num_tokens": 69109926.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "grad_norm": 1.530578374862671, + "learning_rate": 7.668503603221704e-07, + "loss": 0.3933, + "mean_token_accuracy": 0.8656700849533081, + "num_tokens": 69145900.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "grad_norm": 1.390392541885376, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.8583178520202637, + "num_tokens": 69191279.0, + "step": 1811 + }, + { + "epoch": 0.23050502480600432, + "grad_norm": 1.6380081176757812, + "learning_rate": 7.676981771937261e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8607524633407593, + "num_tokens": 69225649.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "grad_norm": 1.4606726169586182, + "learning_rate": 7.681220856295039e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.8514024019241333, + "num_tokens": 69268088.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "grad_norm": 1.437394618988037, + "learning_rate": 7.685459940652819e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8438758850097656, + "num_tokens": 69312271.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "grad_norm": 1.6858352422714233, + "learning_rate": 7.689699025010597e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8360956311225891, + "num_tokens": 69351470.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "grad_norm": 1.76753830909729, + "learning_rate": 7.693938109368376e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8403505086898804, + "num_tokens": 69383869.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "grad_norm": 1.5104469060897827, + "learning_rate": 7.698177193726155e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8463857769966125, + 
"num_tokens": 69424584.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + "grad_norm": 1.7755426168441772, + "learning_rate": 7.702416278083933e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8358910083770752, + "num_tokens": 69458393.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "grad_norm": 1.5977177619934082, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.8509474396705627, + "num_tokens": 69494102.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "grad_norm": 1.4050986766815186, + "learning_rate": 7.710894446799491e-07, + "loss": 0.404, + "mean_token_accuracy": 0.8646776676177979, + "num_tokens": 69536041.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "grad_norm": 1.6598010063171387, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4092, + "mean_token_accuracy": 0.8603371381759644, + "num_tokens": 69570874.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "grad_norm": 1.5148632526397705, + "learning_rate": 7.719372615515049e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8529102206230164, + "num_tokens": 69611597.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "grad_norm": 1.3336414098739624, + "learning_rate": 7.723611699872827e-07, + "loss": 0.3443, + "mean_token_accuracy": 0.8835002183914185, + "num_tokens": 69653728.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "grad_norm": 1.4649349451065063, + "learning_rate": 7.727850784230606e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8535760641098022, + "num_tokens": 69694138.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "grad_norm": 1.5223679542541504, + "learning_rate": 7.732089868588385e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8496693968772888, + "num_tokens": 69737329.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "grad_norm": 1.708791732788086, + "learning_rate": 7.736328952946163e-07, + "loss": 0.4886, + "mean_token_accuracy": 
0.8388466835021973, + "num_tokens": 69772122.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "grad_norm": 1.5366023778915405, + "learning_rate": 7.740568037303942e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8493705987930298, + "num_tokens": 69810025.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "grad_norm": 1.5692603588104248, + "learning_rate": 7.744807121661721e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.839017391204834, + "num_tokens": 69851187.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "grad_norm": 1.6647984981536865, + "learning_rate": 7.749046206019499e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.8511554002761841, + "num_tokens": 69883865.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "grad_norm": 1.468451738357544, + "learning_rate": 7.753285290377279e-07, + "loss": 0.3824, + "mean_token_accuracy": 0.87049400806427, + "num_tokens": 69927217.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "grad_norm": 1.596435546875, + "learning_rate": 7.757524374735057e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8548140525817871, + "num_tokens": 69962482.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "grad_norm": 1.4835190773010254, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8556873798370361, + "num_tokens": 70006224.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "grad_norm": 1.5869959592819214, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8613778352737427, + "num_tokens": 70043352.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "grad_norm": 1.4208500385284424, + "learning_rate": 7.770241627808392e-07, + "loss": 0.3743, + "mean_token_accuracy": 0.8739657402038574, + "num_tokens": 70084915.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "grad_norm": 1.5370911359786987, + "learning_rate": 7.774480712166172e-07, + "loss": 0.4591, + 
"mean_token_accuracy": 0.8478867411613464, + "num_tokens": 70125427.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "grad_norm": 1.7477420568466187, + "learning_rate": 7.77871979652395e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8321378231048584, + "num_tokens": 70156361.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "grad_norm": 1.5615991353988647, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8516944050788879, + "num_tokens": 70197690.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "grad_norm": 1.5063979625701904, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8495064973831177, + "num_tokens": 70240149.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "grad_norm": 1.5265276432037354, + "learning_rate": 7.791437049597287e-07, + "loss": 0.4058, + "mean_token_accuracy": 0.8605527281761169, + "num_tokens": 70278752.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "grad_norm": 1.5791490077972412, + "learning_rate": 7.795676133955065e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8442311882972717, + "num_tokens": 70317152.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "grad_norm": 1.687628984451294, + "learning_rate": 7.799915218312844e-07, + "loss": 0.3987, + "mean_token_accuracy": 0.8682170510292053, + "num_tokens": 70348858.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "grad_norm": 1.5487709045410156, + "learning_rate": 7.804154302670622e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8476625680923462, + "num_tokens": 70392650.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "grad_norm": 1.6182200908660889, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.8549467921257019, + "num_tokens": 70426309.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "grad_norm": 1.492679476737976, + "learning_rate": 7.81263247138618e-07, + "loss": 
0.4509, + "mean_token_accuracy": 0.8484513759613037, + "num_tokens": 70465411.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "grad_norm": 1.4937186241149902, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.866208553314209, + "num_tokens": 70504978.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "grad_norm": 1.6016952991485596, + "learning_rate": 7.821110640101738e-07, + "loss": 0.4052, + "mean_token_accuracy": 0.8606200218200684, + "num_tokens": 70538962.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "grad_norm": 1.6395277976989746, + "learning_rate": 7.825349724459517e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.8329029679298401, + "num_tokens": 70576384.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "grad_norm": 1.5547269582748413, + "learning_rate": 7.829588808817294e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8319579362869263, + "num_tokens": 70619840.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "grad_norm": 1.4105790853500366, + "learning_rate": 7.833827893175074e-07, + "loss": 0.4047, + "mean_token_accuracy": 0.8630111217498779, + "num_tokens": 70660609.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "grad_norm": 1.5513355731964111, + "learning_rate": 7.838066977532852e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8579853773117065, + "num_tokens": 70704359.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "grad_norm": 1.5211186408996582, + "learning_rate": 7.842306061890632e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.8558104634284973, + "num_tokens": 70742328.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "grad_norm": 1.5718944072723389, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4387, + "mean_token_accuracy": 0.856460452079773, + "num_tokens": 70779796.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "grad_norm": 1.5399878025054932, + "learning_rate": 7.850784230606188e-07, + 
"loss": 0.4786, + "mean_token_accuracy": 0.8381005525588989, + "num_tokens": 70819494.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "grad_norm": 1.6548389196395874, + "learning_rate": 7.855023314963968e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8596430420875549, + "num_tokens": 70856264.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "grad_norm": 1.4832700490951538, + "learning_rate": 7.859262399321746e-07, + "loss": 0.4136, + "mean_token_accuracy": 0.859167218208313, + "num_tokens": 70898120.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "grad_norm": 1.654232382774353, + "learning_rate": 7.863501483679524e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.850623607635498, + "num_tokens": 70934625.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "grad_norm": 1.6046175956726074, + "learning_rate": 7.867740568037303e-07, + "loss": 0.4058, + "mean_token_accuracy": 0.8626776933670044, + "num_tokens": 70971141.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "grad_norm": 1.55012845993042, + "learning_rate": 7.871979652395082e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8433221578598022, + "num_tokens": 71011703.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "grad_norm": 1.7556179761886597, + "learning_rate": 7.876218736752861e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8414084911346436, + "num_tokens": 71046386.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "grad_norm": 1.7782038450241089, + "learning_rate": 7.88045782111064e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8422502279281616, + "num_tokens": 71084741.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "grad_norm": 1.4569846391677856, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4009, + "mean_token_accuracy": 0.8650497198104858, + "num_tokens": 71128456.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "grad_norm": 1.6676342487335205, + "learning_rate": 
7.888935989826198e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8393601775169373, + "num_tokens": 71165779.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "grad_norm": 1.575071096420288, + "learning_rate": 7.893175074183976e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8318875432014465, + "num_tokens": 71207611.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "grad_norm": 1.5244407653808594, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8532261848449707, + "num_tokens": 71246104.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "grad_norm": 1.513979434967041, + "learning_rate": 7.901653242899533e-07, + "loss": 0.3894, + "mean_token_accuracy": 0.8635441660881042, + "num_tokens": 71284182.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "grad_norm": 1.540257215499878, + "learning_rate": 7.905892327257312e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8495071530342102, + "num_tokens": 71323037.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "grad_norm": 1.5612549781799316, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8555794954299927, + "num_tokens": 71360240.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "grad_norm": 1.57931387424469, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8553152680397034, + "num_tokens": 71396564.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "grad_norm": 1.6487023830413818, + "learning_rate": 7.918609580330648e-07, + "loss": 0.3938, + "mean_token_accuracy": 0.8627662062644958, + "num_tokens": 71426328.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "grad_norm": 1.6078368425369263, + "learning_rate": 7.922848664688428e-07, + "loss": 0.4251, + "mean_token_accuracy": 0.8555279970169067, + "num_tokens": 71463884.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "grad_norm": 1.4608266353607178, + 
"learning_rate": 7.927087749046205e-07, + "loss": 0.4102, + "mean_token_accuracy": 0.8599658012390137, + "num_tokens": 71503009.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "grad_norm": 1.5174527168273926, + "learning_rate": 7.931326833403983e-07, + "loss": 0.454, + "mean_token_accuracy": 0.85505610704422, + "num_tokens": 71544293.0, + "step": 1872 + }, + { + "epoch": 0.23826485180002543, + "grad_norm": 1.6963615417480469, + "learning_rate": 7.935565917761763e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8436283469200134, + "num_tokens": 71578801.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "grad_norm": 1.5519603490829468, + "learning_rate": 7.939805002119541e-07, + "loss": 0.4015, + "mean_token_accuracy": 0.8609479665756226, + "num_tokens": 71617263.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "grad_norm": 1.4977784156799316, + "learning_rate": 7.944044086477321e-07, + "loss": 0.3758, + "mean_token_accuracy": 0.8723452091217041, + "num_tokens": 71656667.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "grad_norm": 1.6208715438842773, + "learning_rate": 7.948283170835099e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8558305501937866, + "num_tokens": 71696255.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "grad_norm": 1.6135809421539307, + "learning_rate": 7.952522255192878e-07, + "loss": 0.4078, + "mean_token_accuracy": 0.8640583753585815, + "num_tokens": 71730935.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "grad_norm": 1.496525764465332, + "learning_rate": 7.956761339550657e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8577167987823486, + "num_tokens": 71774487.0, + "step": 1878 + }, + { + "epoch": 0.2390281134715685, + "grad_norm": 1.6970666646957397, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8477344512939453, + "num_tokens": 71808477.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "grad_norm": 
1.567219614982605, + "learning_rate": 7.965239508266214e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8535581231117249, + "num_tokens": 71848507.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "grad_norm": 1.5123566389083862, + "learning_rate": 7.969478592623993e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8476231098175049, + "num_tokens": 71889405.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "grad_norm": 1.5755845308303833, + "learning_rate": 7.973717676981771e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8445650935173035, + "num_tokens": 71931033.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "grad_norm": 1.7064027786254883, + "learning_rate": 7.977956761339551e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8548431396484375, + "num_tokens": 71961145.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "grad_norm": 1.5817869901657104, + "learning_rate": 7.982195845697329e-07, + "loss": 0.3978, + "mean_token_accuracy": 0.8664753437042236, + "num_tokens": 71995502.0, + "step": 1884 + }, + { + "epoch": 0.23979137514311155, + "grad_norm": 1.650903582572937, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8474611043930054, + "num_tokens": 72031102.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "grad_norm": 1.4914122819900513, + "learning_rate": 7.990674014412886e-07, + "loss": 0.4155, + "mean_token_accuracy": 0.862303614616394, + "num_tokens": 72071275.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "grad_norm": 1.8574732542037964, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8461378812789917, + "num_tokens": 72101036.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "grad_norm": 1.5047589540481567, + "learning_rate": 7.999152183128444e-07, + "loss": 0.4427, + "mean_token_accuracy": 0.8553470373153687, + "num_tokens": 72140977.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + 
"grad_norm": 1.5614874362945557, + "learning_rate": 8.003391267486223e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.858543872833252, + "num_tokens": 72175155.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "grad_norm": 1.5815047025680542, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8597038984298706, + "num_tokens": 72212799.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + "grad_norm": 1.5636107921600342, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8550803065299988, + "num_tokens": 72247820.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "grad_norm": 1.7555103302001953, + "learning_rate": 8.016108520559559e-07, + "loss": 0.4251, + "mean_token_accuracy": 0.8615410327911377, + "num_tokens": 72278349.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "grad_norm": 1.5394845008850098, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8596312999725342, + "num_tokens": 72318638.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "grad_norm": 1.6044673919677734, + "learning_rate": 8.024586689275116e-07, + "loss": 0.4184, + "mean_token_accuracy": 0.8554401993751526, + "num_tokens": 72354653.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "grad_norm": 1.7169593572616577, + "learning_rate": 8.028825773632894e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8488898277282715, + "num_tokens": 72386069.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "grad_norm": 1.5438107252120972, + "learning_rate": 8.033064857990674e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8530402183532715, + "num_tokens": 72423280.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "grad_norm": 1.566105842590332, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8475019931793213, + "num_tokens": 72461482.0, + "step": 1897 + }, + { + "epoch": 
0.2414451087647882, + "grad_norm": 1.6429224014282227, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4095, + "mean_token_accuracy": 0.863614559173584, + "num_tokens": 72494734.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "grad_norm": 1.6649209260940552, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8432598114013672, + "num_tokens": 72536148.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "grad_norm": 1.5608407258987427, + "learning_rate": 8.050021195421789e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8436550498008728, + "num_tokens": 72573056.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "grad_norm": 1.5186975002288818, + "learning_rate": 8.054260279779567e-07, + "loss": 0.414, + "mean_token_accuracy": 0.8575100898742676, + "num_tokens": 72610351.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "grad_norm": 1.47825026512146, + "learning_rate": 8.058499364137346e-07, + "loss": 0.3959, + "mean_token_accuracy": 0.8665469884872437, + "num_tokens": 72645782.0, + "step": 1902 + }, + { + "epoch": 0.24208116015774075, + "grad_norm": 1.599436640739441, + "learning_rate": 8.062738448495124e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.8626146912574768, + "num_tokens": 72679174.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "grad_norm": 1.5388208627700806, + "learning_rate": 8.066977532852904e-07, + "loss": 0.4191, + "mean_token_accuracy": 0.8587009906768799, + "num_tokens": 72719712.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "grad_norm": 1.6694039106369019, + "learning_rate": 8.071216617210682e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8436781764030457, + "num_tokens": 72755752.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "grad_norm": 1.5321587324142456, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8491693735122681, + "num_tokens": 72791906.0, + "step": 1906 + }, + { + 
"epoch": 0.24259000127210278, + "grad_norm": 1.6060912609100342, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8491287231445312, + "num_tokens": 72826581.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "grad_norm": 1.769904613494873, + "learning_rate": 8.083933870284019e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8356328010559082, + "num_tokens": 72859504.0, + "step": 1908 + }, + { + "epoch": 0.2428444218292838, + "grad_norm": 1.655145287513733, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8622973561286926, + "num_tokens": 72891348.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "grad_norm": 1.4240410327911377, + "learning_rate": 8.092412038999576e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8397098183631897, + "num_tokens": 72934241.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "grad_norm": 1.487655520439148, + "learning_rate": 8.096651123357354e-07, + "loss": 0.3983, + "mean_token_accuracy": 0.8649374842643738, + "num_tokens": 72972193.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "grad_norm": 1.4810317754745483, + "learning_rate": 8.100890207715134e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.8592631220817566, + "num_tokens": 73008933.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "grad_norm": 1.448461651802063, + "learning_rate": 8.105129292072912e-07, + "loss": 0.3902, + "mean_token_accuracy": 0.8666009902954102, + "num_tokens": 73048205.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "grad_norm": 1.596766710281372, + "learning_rate": 8.10936837643069e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8466974496841431, + "num_tokens": 73086606.0, + "step": 1914 + }, + { + "epoch": 0.24360768350082687, + "grad_norm": 1.5653855800628662, + "learning_rate": 8.11360746078847e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8578944206237793, + "num_tokens": 73125278.0, + "step": 1915 + }, 
+ { + "epoch": 0.24373489377941737, + "grad_norm": 1.610997200012207, + "learning_rate": 8.117846545146248e-07, + "loss": 0.431, + "mean_token_accuracy": 0.855222225189209, + "num_tokens": 73163158.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "grad_norm": 1.5859981775283813, + "learning_rate": 8.122085629504026e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8550748825073242, + "num_tokens": 73199180.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "grad_norm": 1.5251941680908203, + "learning_rate": 8.126324713861805e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8328573107719421, + "num_tokens": 73238426.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "grad_norm": 1.5119857788085938, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.859013557434082, + "num_tokens": 73273722.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "grad_norm": 1.7207646369934082, + "learning_rate": 8.134802882577363e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8542232513427734, + "num_tokens": 73305824.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + "grad_norm": 1.5115786790847778, + "learning_rate": 8.139041966935142e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.856432318687439, + "num_tokens": 73348146.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "grad_norm": 1.5735726356506348, + "learning_rate": 8.14328105129292e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.844584584236145, + "num_tokens": 73384439.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "grad_norm": 1.7096189260482788, + "learning_rate": 8.1475201356507e-07, + "loss": 0.5244, + "mean_token_accuracy": 0.8313887119293213, + "num_tokens": 73421245.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "grad_norm": 1.5058088302612305, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8576878905296326, + "num_tokens": 73465371.0, + "step": 1924 
+ }, + { + "epoch": 0.24487978628673196, + "grad_norm": 1.596381425857544, + "learning_rate": 8.155998304366256e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.8582149744033813, + "num_tokens": 73502525.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "grad_norm": 1.5261896848678589, + "learning_rate": 8.160237388724035e-07, + "loss": 0.4017, + "mean_token_accuracy": 0.8642653822898865, + "num_tokens": 73537092.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "grad_norm": 1.6044871807098389, + "learning_rate": 8.164476473081814e-07, + "loss": 0.431, + "mean_token_accuracy": 0.8557811975479126, + "num_tokens": 73574696.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "grad_norm": 1.428430438041687, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8614903092384338, + "num_tokens": 73615188.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "grad_norm": 1.628005027770996, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.8584393262863159, + "num_tokens": 73651544.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "grad_norm": 1.616933822631836, + "learning_rate": 8.17719372615515e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8473901748657227, + "num_tokens": 73687689.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "grad_norm": 1.6466401815414429, + "learning_rate": 8.18143281051293e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8306697607040405, + "num_tokens": 73725871.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "grad_norm": 1.6008912324905396, + "learning_rate": 8.185671894870707e-07, + "loss": 0.3669, + "mean_token_accuracy": 0.8725762367248535, + "num_tokens": 73758779.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "grad_norm": 1.4822584390640259, + "learning_rate": 8.189910979228485e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8483914136886597, + "num_tokens": 73801372.0, + "step": 
1933 + }, + { + "epoch": 0.24602467879404655, + "grad_norm": 1.4548494815826416, + "learning_rate": 8.194150063586265e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8552829623222351, + "num_tokens": 73845976.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "grad_norm": 1.5495684146881104, + "learning_rate": 8.198389147944043e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8347775936126709, + "num_tokens": 73886124.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "grad_norm": 1.542669415473938, + "learning_rate": 8.202628232301823e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8544921278953552, + "num_tokens": 73922487.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "grad_norm": 1.5368075370788574, + "learning_rate": 8.206867316659601e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8507298231124878, + "num_tokens": 73962659.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "grad_norm": 1.4509847164154053, + "learning_rate": 8.21110640101738e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.848940372467041, + "num_tokens": 74006430.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "grad_norm": 1.4791244268417358, + "learning_rate": 8.215345485375159e-07, + "loss": 0.3793, + "mean_token_accuracy": 0.870343804359436, + "num_tokens": 74043157.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "grad_norm": 1.5161811113357544, + "learning_rate": 8.219584569732937e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8550604581832886, + "num_tokens": 74082596.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "grad_norm": 1.4496008157730103, + "learning_rate": 8.223823654090715e-07, + "loss": 0.3876, + "mean_token_accuracy": 0.8691995143890381, + "num_tokens": 74122979.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "grad_norm": 1.5574798583984375, + "learning_rate": 8.228062738448495e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8380191326141357, + "num_tokens": 74160866.0, + 
"step": 1942 + }, + { + "epoch": 0.24716957130136116, + "grad_norm": 1.5806595087051392, + "learning_rate": 8.232301822806273e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8505376577377319, + "num_tokens": 74196841.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "grad_norm": 1.4480572938919067, + "learning_rate": 8.236540907164053e-07, + "loss": 0.4006, + "mean_token_accuracy": 0.8645886182785034, + "num_tokens": 74240800.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "grad_norm": 1.6142404079437256, + "learning_rate": 8.240779991521831e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8486844301223755, + "num_tokens": 74277593.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "grad_norm": 1.4878989458084106, + "learning_rate": 8.24501907587961e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8611570596694946, + "num_tokens": 74320353.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "grad_norm": 1.604771614074707, + "learning_rate": 8.249258160237388e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8521640300750732, + "num_tokens": 74357820.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "grad_norm": 1.5116310119628906, + "learning_rate": 8.253497244595167e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.850303053855896, + "num_tokens": 74399490.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "grad_norm": 1.5726594924926758, + "learning_rate": 8.257736328952945e-07, + "loss": 0.3955, + "mean_token_accuracy": 0.8646522164344788, + "num_tokens": 74436750.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "grad_norm": 1.5176167488098145, + "learning_rate": 8.261975413310725e-07, + "loss": 0.415, + "mean_token_accuracy": 0.8613244295120239, + "num_tokens": 74473918.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "grad_norm": 1.6378690004348755, + "learning_rate": 8.266214497668503e-07, + "loss": 0.3951, + "mean_token_accuracy": 0.8656111359596252, + "num_tokens": 
74508675.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "grad_norm": 1.5411179065704346, + "learning_rate": 8.270453582026283e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.8569990396499634, + "num_tokens": 74549856.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "grad_norm": 1.509775996208191, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8500062227249146, + "num_tokens": 74587977.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "grad_norm": 1.5470348596572876, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8491115570068359, + "num_tokens": 74623964.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "grad_norm": 1.4496455192565918, + "learning_rate": 8.283170835099618e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.8742599487304688, + "num_tokens": 74663293.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "grad_norm": 1.5411932468414307, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8532511591911316, + "num_tokens": 74702345.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "grad_norm": 1.5228229761123657, + "learning_rate": 8.291649003815175e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8498549461364746, + "num_tokens": 74740806.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "grad_norm": 1.6836535930633545, + "learning_rate": 8.295888088172954e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8607507944107056, + "num_tokens": 74771511.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "grad_norm": 1.615695595741272, + "learning_rate": 8.300127172530733e-07, + "loss": 0.4173, + "mean_token_accuracy": 0.8589744567871094, + "num_tokens": 74807725.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "grad_norm": 1.4855189323425293, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4247, + "mean_token_accuracy": 0.8575540781021118, + 
"num_tokens": 74848841.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "grad_norm": 1.5734270811080933, + "learning_rate": 8.308605341246291e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8506057858467102, + "num_tokens": 74887884.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "grad_norm": 1.6177479028701782, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.8548716902732849, + "num_tokens": 74922876.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "grad_norm": 1.565686821937561, + "learning_rate": 8.317083509961848e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8461058139801025, + "num_tokens": 74961688.0, + "step": 1963 + }, + { + "epoch": 0.24984098715176187, + "grad_norm": 1.3765839338302612, + "learning_rate": 8.321322594319626e-07, + "loss": 0.3781, + "mean_token_accuracy": 0.8697629570960999, + "num_tokens": 75008625.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "grad_norm": 1.6137404441833496, + "learning_rate": 8.325561678677405e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8475719690322876, + "num_tokens": 75047248.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "grad_norm": 1.759351372718811, + "learning_rate": 8.329800763035184e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8402236104011536, + "num_tokens": 75081897.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "grad_norm": 1.5533387660980225, + "learning_rate": 8.334039847392963e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8466810584068298, + "num_tokens": 75119687.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "grad_norm": 1.5634965896606445, + "learning_rate": 8.338278931750742e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8486598134040833, + "num_tokens": 75157160.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "grad_norm": 1.5308184623718262, + "learning_rate": 8.342518016108521e-07, + "loss": 0.4211, + "mean_token_accuracy": 
0.8584433794021606, + "num_tokens": 75195232.0, + "step": 1969 + }, + { + "epoch": 0.2506042488233049, + "grad_norm": 1.547328233718872, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.8462771773338318, + "num_tokens": 75231481.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "grad_norm": 1.8158574104309082, + "learning_rate": 8.350996184824078e-07, + "loss": 0.5115, + "mean_token_accuracy": 0.8316208124160767, + "num_tokens": 75265775.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "grad_norm": 1.4499138593673706, + "learning_rate": 8.355235269181856e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8476565480232239, + "num_tokens": 75309391.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "grad_norm": 1.719084620475769, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8486625552177429, + "num_tokens": 75344033.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "grad_norm": 1.5287468433380127, + "learning_rate": 8.363713437897414e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8587106466293335, + "num_tokens": 75382684.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "grad_norm": 1.6439547538757324, + "learning_rate": 8.367952522255193e-07, + "loss": 0.418, + "mean_token_accuracy": 0.8612492084503174, + "num_tokens": 75417375.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, + "grad_norm": 1.472784399986267, + "learning_rate": 8.372191606612972e-07, + "loss": 0.414, + "mean_token_accuracy": 0.8629676103591919, + "num_tokens": 75456992.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "grad_norm": 1.4978116750717163, + "learning_rate": 8.376430690970749e-07, + "loss": 0.4027, + "mean_token_accuracy": 0.8622164726257324, + "num_tokens": 75492288.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "grad_norm": 1.6566646099090576, + "learning_rate": 8.380669775328528e-07, + "loss": 0.4513, + 
"mean_token_accuracy": 0.8490800857543945, + "num_tokens": 75526495.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "grad_norm": 1.6065753698349, + "learning_rate": 8.384908859686307e-07, + "loss": 0.3924, + "mean_token_accuracy": 0.8659505248069763, + "num_tokens": 75564145.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "grad_norm": 1.5151053667068481, + "learning_rate": 8.389147944044086e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8482342958450317, + "num_tokens": 75604406.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "grad_norm": 1.4682443141937256, + "learning_rate": 8.393387028401864e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8612067699432373, + "num_tokens": 75648992.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "grad_norm": 1.5428757667541504, + "learning_rate": 8.397626112759644e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.84868323802948, + "num_tokens": 75689573.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "grad_norm": 1.6575276851654053, + "learning_rate": 8.401865197117422e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8483078479766846, + "num_tokens": 75724798.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "grad_norm": 2.0175743103027344, + "learning_rate": 8.406104281475202e-07, + "loss": 0.3956, + "mean_token_accuracy": 0.8638725876808167, + "num_tokens": 75758640.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "grad_norm": 1.5335928201675415, + "learning_rate": 8.410343365832979e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8616706728935242, + "num_tokens": 75800684.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "grad_norm": 1.4790035486221313, + "learning_rate": 8.414582450190758e-07, + "loss": 0.4049, + "mean_token_accuracy": 0.8647480010986328, + "num_tokens": 75841086.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "grad_norm": 1.6817264556884766, + "learning_rate": 8.418821534548537e-07, + "loss": 0.4669, 
+ "mean_token_accuracy": 0.8437148928642273, + "num_tokens": 75877282.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "grad_norm": 1.7215192317962646, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.83916175365448, + "num_tokens": 75911668.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "grad_norm": 1.6071436405181885, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8518339395523071, + "num_tokens": 75950076.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "grad_norm": 1.5082063674926758, + "learning_rate": 8.431538787621874e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.849216878414154, + "num_tokens": 75993117.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "grad_norm": 1.5936849117279053, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.857349157333374, + "num_tokens": 76030442.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "grad_norm": 1.6706905364990234, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8495567440986633, + "num_tokens": 76065457.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "grad_norm": 1.6609808206558228, + "learning_rate": 8.444256040695209e-07, + "loss": 0.519, + "mean_token_accuracy": 0.8277314305305481, + "num_tokens": 76102586.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "grad_norm": 1.4974294900894165, + "learning_rate": 8.448495125052988e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8428788185119629, + "num_tokens": 76144102.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "grad_norm": 1.6795741319656372, + "learning_rate": 8.452734209410767e-07, + "loss": 0.3856, + "mean_token_accuracy": 0.8640614748001099, + "num_tokens": 76176689.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "grad_norm": 1.489783525466919, + "learning_rate": 8.456973293768545e-07, + "loss": 
0.4583, + "mean_token_accuracy": 0.8470658659934998, + "num_tokens": 76215587.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "grad_norm": 1.7414672374725342, + "learning_rate": 8.461212378126325e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8324731588363647, + "num_tokens": 76252488.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "grad_norm": 1.7021806240081787, + "learning_rate": 8.465451462484103e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8453805446624756, + "num_tokens": 76286071.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "grad_norm": 1.5422338247299194, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8447139263153076, + "num_tokens": 76326691.0, + "step": 1999 + }, + { + "epoch": 0.25442055718102025, + "grad_norm": 1.429511308670044, + "learning_rate": 8.47392963119966e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8448790311813354, + "num_tokens": 76370703.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "grad_norm": 1.421899437904358, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4141, + "mean_token_accuracy": 0.8589937090873718, + "num_tokens": 76413844.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "grad_norm": 1.5520416498184204, + "learning_rate": 8.482407799915217e-07, + "loss": 0.4343, + "mean_token_accuracy": 0.8571805953979492, + "num_tokens": 76451189.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "grad_norm": 1.593909502029419, + "learning_rate": 8.486646884272997e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8459754586219788, + "num_tokens": 76490199.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "grad_norm": 1.6301718950271606, + "learning_rate": 8.490885968630775e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.8613803386688232, + "num_tokens": 76527107.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "grad_norm": 1.6948356628417969, + "learning_rate": 8.495125052988555e-07, + 
"loss": 0.4663, + "mean_token_accuracy": 0.8449198007583618, + "num_tokens": 76561906.0, + "step": 2005 + }, + { + "epoch": 0.2551838188525633, + "grad_norm": 1.5628583431243896, + "learning_rate": 8.499364137346333e-07, + "loss": 0.4156, + "mean_token_accuracy": 0.8567211627960205, + "num_tokens": 76603096.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "grad_norm": 1.5856268405914307, + "learning_rate": 8.503603221704112e-07, + "loss": 0.4179, + "mean_token_accuracy": 0.8581769466400146, + "num_tokens": 76640935.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "grad_norm": 1.5266908407211304, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8493585586547852, + "num_tokens": 76680759.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "grad_norm": 1.5217620134353638, + "learning_rate": 8.512081390419669e-07, + "loss": 0.4218, + "mean_token_accuracy": 0.861485481262207, + "num_tokens": 76720753.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "grad_norm": 1.6013209819793701, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4289, + "mean_token_accuracy": 0.8556517362594604, + "num_tokens": 76756934.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "grad_norm": 1.485005497932434, + "learning_rate": 8.520559559135227e-07, + "loss": 0.3721, + "mean_token_accuracy": 0.8733780384063721, + "num_tokens": 76795131.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + "grad_norm": 1.5469474792480469, + "learning_rate": 8.524798643493005e-07, + "loss": 0.3715, + "mean_token_accuracy": 0.8722490072250366, + "num_tokens": 76836550.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "grad_norm": 1.512603998184204, + "learning_rate": 8.529037727850785e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8460925817489624, + "num_tokens": 76876908.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "grad_norm": 1.5108827352523804, + "learning_rate": 
8.533276812208563e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8502203226089478, + "num_tokens": 76917415.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "grad_norm": 1.5667531490325928, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.8617795705795288, + "num_tokens": 76949010.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "grad_norm": 1.5356463193893433, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8562671542167664, + "num_tokens": 76990031.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "grad_norm": 1.4507625102996826, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.8528128266334534, + "num_tokens": 77030117.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "grad_norm": 1.5471254587173462, + "learning_rate": 8.550233149639677e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.8629993200302124, + "num_tokens": 77069793.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "grad_norm": 1.6769105195999146, + "learning_rate": 8.554472233997456e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8455272912979126, + "num_tokens": 77105332.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "grad_norm": 1.4952200651168823, + "learning_rate": 8.558711318355235e-07, + "loss": 0.4181, + "mean_token_accuracy": 0.860055685043335, + "num_tokens": 77147678.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "grad_norm": 1.6234205961227417, + "learning_rate": 8.562950402713014e-07, + "loss": 0.4035, + "mean_token_accuracy": 0.8630111217498779, + "num_tokens": 77183805.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "grad_norm": 1.5063813924789429, + "learning_rate": 8.567189487070793e-07, + "loss": 0.4129, + "mean_token_accuracy": 0.8631116151809692, + "num_tokens": 77224899.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "grad_norm": 1.504517674446106, + 
"learning_rate": 8.57142857142857e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8589996099472046, + "num_tokens": 77261351.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "grad_norm": 1.5832252502441406, + "learning_rate": 8.57566765578635e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8559603691101074, + "num_tokens": 77295632.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "grad_norm": 1.4867048263549805, + "learning_rate": 8.579906740144128e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8464056253433228, + "num_tokens": 77336614.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "grad_norm": 1.449509859085083, + "learning_rate": 8.584145824501907e-07, + "loss": 0.3819, + "mean_token_accuracy": 0.868556797504425, + "num_tokens": 77376821.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "grad_norm": 1.495273470878601, + "learning_rate": 8.588384908859686e-07, + "loss": 0.3819, + "mean_token_accuracy": 0.8706170916557312, + "num_tokens": 77415098.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "grad_norm": 1.6144821643829346, + "learning_rate": 8.592623993217465e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8487980365753174, + "num_tokens": 77453579.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "grad_norm": 1.5659338235855103, + "learning_rate": 8.596863077575244e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8623295426368713, + "num_tokens": 77495296.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "grad_norm": 1.6052885055541992, + "learning_rate": 8.601102161933023e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8299360275268555, + "num_tokens": 77539262.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "grad_norm": 1.6643149852752686, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4375, + "mean_token_accuracy": 0.8538269400596619, + "num_tokens": 77576012.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "grad_norm": 
1.4905685186386108, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8481372594833374, + "num_tokens": 77617587.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "grad_norm": 1.558395504951477, + "learning_rate": 8.613819415006358e-07, + "loss": 0.3831, + "mean_token_accuracy": 0.8697128295898438, + "num_tokens": 77654211.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "grad_norm": 1.6769367456436157, + "learning_rate": 8.618058499364137e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8537541031837463, + "num_tokens": 77695956.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "grad_norm": 1.6733334064483643, + "learning_rate": 8.622297583721916e-07, + "loss": 0.3833, + "mean_token_accuracy": 0.8709696531295776, + "num_tokens": 77728686.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "grad_norm": 1.999414324760437, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8506468534469604, + "num_tokens": 77759106.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "grad_norm": 1.6160650253295898, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4116, + "mean_token_accuracy": 0.8631359934806824, + "num_tokens": 77794031.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "grad_norm": 1.6273068189620972, + "learning_rate": 8.635014836795251e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8535853624343872, + "num_tokens": 77828593.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "grad_norm": 1.6747455596923828, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8399366140365601, + "num_tokens": 77865118.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "grad_norm": 1.60696542263031, + "learning_rate": 8.643493005510809e-07, + "loss": 0.417, + "mean_token_accuracy": 0.8597759008407593, + "num_tokens": 77902068.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + 
"grad_norm": 1.6888850927352905, + "learning_rate": 8.647732089868588e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8440716862678528, + "num_tokens": 77940973.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "grad_norm": 1.486484408378601, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.847535252571106, + "num_tokens": 77986191.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "grad_norm": 1.6452434062957764, + "learning_rate": 8.656210258584146e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8461548089981079, + "num_tokens": 78024873.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "grad_norm": 1.4531495571136475, + "learning_rate": 8.660449342941924e-07, + "loss": 0.4214, + "mean_token_accuracy": 0.8578689694404602, + "num_tokens": 78065633.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "grad_norm": 1.5230872631072998, + "learning_rate": 8.664688427299704e-07, + "loss": 0.499, + "mean_token_accuracy": 0.837319016456604, + "num_tokens": 78111281.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "grad_norm": 1.574419617652893, + "learning_rate": 8.668927511657481e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8532875180244446, + "num_tokens": 78147525.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "grad_norm": 1.6368731260299683, + "learning_rate": 8.67316659601526e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8438924551010132, + "num_tokens": 78183850.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "grad_norm": 1.7560837268829346, + "learning_rate": 8.677405680373039e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8528280258178711, + "num_tokens": 78220616.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "grad_norm": 1.594367504119873, + "learning_rate": 8.681644764730818e-07, + "loss": 0.4152, + "mean_token_accuracy": 0.8594164252281189, + "num_tokens": 78258352.0, + "step": 2049 + }, + { + "epoch": 
0.2607810711105457, + "grad_norm": 1.5475807189941406, + "learning_rate": 8.685883849088596e-07, + "loss": 0.3944, + "mean_token_accuracy": 0.8653489947319031, + "num_tokens": 78292773.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "grad_norm": 1.8093734979629517, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.83970707654953, + "num_tokens": 78324060.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "grad_norm": 1.6668553352355957, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8601660132408142, + "num_tokens": 78356401.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "grad_norm": 1.5640971660614014, + "learning_rate": 8.698601102161933e-07, + "loss": 0.409, + "mean_token_accuracy": 0.8609746694564819, + "num_tokens": 78393135.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "grad_norm": 1.5303239822387695, + "learning_rate": 8.702840186519711e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.850788950920105, + "num_tokens": 78431847.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "grad_norm": 1.536037564277649, + "learning_rate": 8.70707927087749e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8377015590667725, + "num_tokens": 78476701.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "grad_norm": 1.530829668045044, + "learning_rate": 8.711318355235269e-07, + "loss": 0.3712, + "mean_token_accuracy": 0.8718278408050537, + "num_tokens": 78509911.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "grad_norm": 1.608420968055725, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4074, + "mean_token_accuracy": 0.8628981113433838, + "num_tokens": 78547405.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "grad_norm": 1.5487161874771118, + "learning_rate": 8.719796523950826e-07, + "loss": 0.4229, + "mean_token_accuracy": 0.8578006029129028, + "num_tokens": 78583991.0, + "step": 2058 + }, + { + 
"epoch": 0.2619259636178603, + "grad_norm": 1.4669194221496582, + "learning_rate": 8.724035608308605e-07, + "loss": 0.4514, + "mean_token_accuracy": 0.8491876125335693, + "num_tokens": 78624214.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "grad_norm": 1.5595102310180664, + "learning_rate": 8.728274692666384e-07, + "loss": 0.4359, + "mean_token_accuracy": 0.8526425361633301, + "num_tokens": 78666607.0, + "step": 2060 + }, + { + "epoch": 0.26218038417504136, + "grad_norm": 1.497180461883545, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4061, + "mean_token_accuracy": 0.8617802262306213, + "num_tokens": 78704631.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "grad_norm": 1.4023445844650269, + "learning_rate": 8.736752861381941e-07, + "loss": 0.4065, + "mean_token_accuracy": 0.8629738092422485, + "num_tokens": 78751478.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "grad_norm": 1.605607271194458, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8500390648841858, + "num_tokens": 78789211.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "grad_norm": 1.7023364305496216, + "learning_rate": 8.745231030097499e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8466358780860901, + "num_tokens": 78824184.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "grad_norm": 1.4635372161865234, + "learning_rate": 8.749470114455277e-07, + "loss": 0.4488, + "mean_token_accuracy": 0.8516553640365601, + "num_tokens": 78871830.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "grad_norm": 1.5444636344909668, + "learning_rate": 8.753709198813056e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8513501286506653, + "num_tokens": 78911880.0, + "step": 2066 + }, + { + "epoch": 0.2629436458465844, + "grad_norm": 1.4415812492370605, + "learning_rate": 8.757948283170835e-07, + "loss": 0.3824, + "mean_token_accuracy": 0.8714936971664429, + "num_tokens": 78953613.0, + "step": 2067 
+ }, + { + "epoch": 0.2630708561251749, + "grad_norm": 1.4642765522003174, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.8559106588363647, + "num_tokens": 78999736.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "grad_norm": 1.5481199026107788, + "learning_rate": 8.766426451886392e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8453709483146667, + "num_tokens": 79038811.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "grad_norm": 1.5343648195266724, + "learning_rate": 8.770665536244171e-07, + "loss": 0.3995, + "mean_token_accuracy": 0.8676291108131409, + "num_tokens": 79078956.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "grad_norm": 1.6202479600906372, + "learning_rate": 8.774904620601949e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8336243033409119, + "num_tokens": 79119646.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "grad_norm": 1.5347216129302979, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8518869280815125, + "num_tokens": 79159531.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + "grad_norm": 1.5021986961364746, + "learning_rate": 8.783382789317507e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8485205769538879, + "num_tokens": 79200195.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "grad_norm": 1.5603867769241333, + "learning_rate": 8.787621873675286e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.836550235748291, + "num_tokens": 79243496.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "grad_norm": 1.5064586400985718, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8528491258621216, + "num_tokens": 79285239.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "grad_norm": 1.5535969734191895, + "learning_rate": 8.796100042390842e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8475726842880249, + "num_tokens": 79327601.0, + 
"step": 2076 + }, + { + "epoch": 0.2642157486324895, + "grad_norm": 1.533967137336731, + "learning_rate": 8.800339126748622e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8559743762016296, + "num_tokens": 79366348.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "grad_norm": 1.619920015335083, + "learning_rate": 8.8045782111064e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8530842065811157, + "num_tokens": 79401116.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "grad_norm": 1.4852334260940552, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4101, + "mean_token_accuracy": 0.8616918921470642, + "num_tokens": 79439306.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "grad_norm": 1.7816511392593384, + "learning_rate": 8.813056379821958e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.842382550239563, + "num_tokens": 79472724.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "grad_norm": 1.5863151550292969, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8491116762161255, + "num_tokens": 79513497.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "grad_norm": 1.5529863834381104, + "learning_rate": 8.821534548537515e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.8568233251571655, + "num_tokens": 79555921.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "grad_norm": 1.638215184211731, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8543765544891357, + "num_tokens": 79593275.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "grad_norm": 1.561890721321106, + "learning_rate": 8.830012717253072e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.8602689504623413, + "num_tokens": 79630127.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "grad_norm": 1.5133402347564697, + "learning_rate": 8.834251801610852e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8454526662826538, + "num_tokens": 
79671707.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "grad_norm": 1.7335618734359741, + "learning_rate": 8.83849088596863e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8353241682052612, + "num_tokens": 79707742.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "grad_norm": 1.6190440654754639, + "learning_rate": 8.842729970326409e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8472321033477783, + "num_tokens": 79744949.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "grad_norm": 1.6798113584518433, + "learning_rate": 8.846969054684188e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8435924053192139, + "num_tokens": 79782428.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "grad_norm": 1.4303148984909058, + "learning_rate": 8.851208139041967e-07, + "loss": 0.4128, + "mean_token_accuracy": 0.8599367737770081, + "num_tokens": 79823947.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "grad_norm": 1.689936637878418, + "learning_rate": 8.855447223399745e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.849105954170227, + "num_tokens": 79859922.0, + "step": 2090 + }, + { + "epoch": 0.26599669253275665, + "grad_norm": 1.308359146118164, + "learning_rate": 8.859686307757524e-07, + "loss": 0.4036, + "mean_token_accuracy": 0.8630686402320862, + "num_tokens": 79910134.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "grad_norm": 1.6433000564575195, + "learning_rate": 8.863925392115302e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8535194396972656, + "num_tokens": 79943741.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "grad_norm": 1.535752773284912, + "learning_rate": 8.868164476473082e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8539068102836609, + "num_tokens": 79985465.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "grad_norm": 1.6575075387954712, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.8484600782394409, + 
"num_tokens": 80021969.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "grad_norm": 1.5677131414413452, + "learning_rate": 8.876642645188639e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8569244146347046, + "num_tokens": 80058216.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "grad_norm": 1.5441763401031494, + "learning_rate": 8.880881729546418e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8521108627319336, + "num_tokens": 80095643.0, + "step": 2096 + }, + { + "epoch": 0.2667599542042997, + "grad_norm": 1.624497652053833, + "learning_rate": 8.885120813904197e-07, + "loss": 0.4418, + "mean_token_accuracy": 0.8492931127548218, + "num_tokens": 80132590.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "grad_norm": 1.5780203342437744, + "learning_rate": 8.889359898261976e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8495402336120605, + "num_tokens": 80167867.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "grad_norm": 1.8126070499420166, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8463585376739502, + "num_tokens": 80200264.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "grad_norm": 1.7071685791015625, + "learning_rate": 8.897838066977532e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8506154417991638, + "num_tokens": 80234022.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "grad_norm": 1.623657464981079, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8367726802825928, + "num_tokens": 80273918.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "grad_norm": 1.665290117263794, + "learning_rate": 8.90631623569309e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8564995527267456, + "num_tokens": 80310401.0, + "step": 2102 + }, + { + "epoch": 0.26752321587584277, + "grad_norm": 1.4146897792816162, + "learning_rate": 8.910555320050868e-07, + "loss": 0.3863, + "mean_token_accuracy": 
0.8695088624954224, + "num_tokens": 80351165.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "grad_norm": 1.5036089420318604, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4333, + "mean_token_accuracy": 0.8509414196014404, + "num_tokens": 80390345.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "grad_norm": 1.4656461477279663, + "learning_rate": 8.919033488766426e-07, + "loss": 0.4181, + "mean_token_accuracy": 0.8584913015365601, + "num_tokens": 80433146.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "grad_norm": 1.5196847915649414, + "learning_rate": 8.923272573124204e-07, + "loss": 0.4107, + "mean_token_accuracy": 0.8619739413261414, + "num_tokens": 80470380.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "grad_norm": 1.5659364461898804, + "learning_rate": 8.927511657481983e-07, + "loss": 0.4038, + "mean_token_accuracy": 0.8635256290435791, + "num_tokens": 80504061.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "grad_norm": 1.5992937088012695, + "learning_rate": 8.931750741839762e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8557741045951843, + "num_tokens": 80537797.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + "grad_norm": 1.4846227169036865, + "learning_rate": 8.935989826197541e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.850357174873352, + "num_tokens": 80584893.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "grad_norm": 1.633949875831604, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.8575466871261597, + "num_tokens": 80624005.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "grad_norm": 1.380522608757019, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4168, + "mean_token_accuracy": 0.8593507409095764, + "num_tokens": 80670011.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "grad_norm": 1.5719127655029297, + "learning_rate": 8.948707079270878e-07, + "loss": 0.3936, + 
"mean_token_accuracy": 0.8662902116775513, + "num_tokens": 80705827.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "grad_norm": 1.6387416124343872, + "learning_rate": 8.952946163628656e-07, + "loss": 0.3872, + "mean_token_accuracy": 0.868763267993927, + "num_tokens": 80737656.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "grad_norm": 1.619920253753662, + "learning_rate": 8.957185247986434e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8464120626449585, + "num_tokens": 80776014.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "grad_norm": 1.960033655166626, + "learning_rate": 8.961424332344213e-07, + "loss": 0.4203, + "mean_token_accuracy": 0.856996476650238, + "num_tokens": 80815166.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "grad_norm": 1.5655566453933716, + "learning_rate": 8.965663416701992e-07, + "loss": 0.4021, + "mean_token_accuracy": 0.8652412295341492, + "num_tokens": 80854962.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "grad_norm": 1.5294160842895508, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8478252291679382, + "num_tokens": 80895169.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "grad_norm": 1.638601541519165, + "learning_rate": 8.97414158541755e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8456723093986511, + "num_tokens": 80931853.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "grad_norm": 1.4380587339401245, + "learning_rate": 8.978380669775328e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8607953190803528, + "num_tokens": 80974244.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "grad_norm": 1.4603554010391235, + "learning_rate": 8.982619754133107e-07, + "loss": 0.3894, + "mean_token_accuracy": 0.8682896494865417, + "num_tokens": 81014539.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "grad_norm": 1.533500075340271, + "learning_rate": 8.986858838490886e-07, + "loss": 
0.4382, + "mean_token_accuracy": 0.8523076176643372, + "num_tokens": 81052984.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "grad_norm": 1.5783898830413818, + "learning_rate": 8.991097922848663e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8332550525665283, + "num_tokens": 81095422.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "grad_norm": 1.6148793697357178, + "learning_rate": 8.995337007206443e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8596264123916626, + "num_tokens": 81128900.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "grad_norm": 1.4556819200515747, + "learning_rate": 8.999576091564221e-07, + "loss": 0.4025, + "mean_token_accuracy": 0.8656758666038513, + "num_tokens": 81169217.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "grad_norm": 1.5356312990188599, + "learning_rate": 9.003815175922001e-07, + "loss": 0.3938, + "mean_token_accuracy": 0.8675397634506226, + "num_tokens": 81205649.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "grad_norm": 1.581397533416748, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8586687445640564, + "num_tokens": 81241665.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "grad_norm": 1.638871431350708, + "learning_rate": 9.012293344637558e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8490592241287231, + "num_tokens": 81275632.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "grad_norm": 1.5951793193817139, + "learning_rate": 9.016532428995337e-07, + "loss": 0.46, + "mean_token_accuracy": 0.8428066968917847, + "num_tokens": 81312461.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "grad_norm": 1.5016940832138062, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8558381199836731, + "num_tokens": 81356428.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "grad_norm": 1.6626843214035034, + "learning_rate": 9.025010597710894e-07, + 
"loss": 0.3964, + "mean_token_accuracy": 0.8643748760223389, + "num_tokens": 81388381.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "grad_norm": 1.3906056880950928, + "learning_rate": 9.029249682068673e-07, + "loss": 0.4011, + "mean_token_accuracy": 0.8656717538833618, + "num_tokens": 81431824.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "grad_norm": 1.5432103872299194, + "learning_rate": 9.033488766426451e-07, + "loss": 0.4267, + "mean_token_accuracy": 0.8556028604507446, + "num_tokens": 81470408.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "grad_norm": 1.6484906673431396, + "learning_rate": 9.037727850784231e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8439228534698486, + "num_tokens": 81508932.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "grad_norm": 1.5651805400848389, + "learning_rate": 9.041966935142009e-07, + "loss": 0.3853, + "mean_token_accuracy": 0.8696572780609131, + "num_tokens": 81542909.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "grad_norm": 1.7051249742507935, + "learning_rate": 9.046206019499788e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8491528034210205, + "num_tokens": 81581990.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "grad_norm": 1.6161508560180664, + "learning_rate": 9.050445103857567e-07, + "loss": 0.408, + "mean_token_accuracy": 0.8631484508514404, + "num_tokens": 81617114.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "grad_norm": 1.5918028354644775, + "learning_rate": 9.054684188215344e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8590986728668213, + "num_tokens": 81655840.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "grad_norm": 1.465335726737976, + "learning_rate": 9.058923272573124e-07, + "loss": 0.3972, + "mean_token_accuracy": 0.862726092338562, + "num_tokens": 81692883.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "grad_norm": 1.627916693687439, + "learning_rate": 
9.063162356930902e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8407664895057678, + "num_tokens": 81729215.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "grad_norm": 1.6341071128845215, + "learning_rate": 9.067401441288681e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.850999653339386, + "num_tokens": 81763451.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "grad_norm": 1.5515638589859009, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8579939007759094, + "num_tokens": 81807490.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "grad_norm": 1.6173455715179443, + "learning_rate": 9.075879610004239e-07, + "loss": 0.4017, + "mean_token_accuracy": 0.8650323152542114, + "num_tokens": 81847038.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "grad_norm": 1.7588200569152832, + "learning_rate": 9.080118694362017e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8489864468574524, + "num_tokens": 81879607.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "grad_norm": 1.5727813243865967, + "learning_rate": 9.084357778719796e-07, + "loss": 0.429, + "mean_token_accuracy": 0.8539859056472778, + "num_tokens": 81916496.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "grad_norm": 1.5374374389648438, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4149, + "mean_token_accuracy": 0.8596864938735962, + "num_tokens": 81957162.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "grad_norm": 1.611296534538269, + "learning_rate": 9.092835947435354e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8437866568565369, + "num_tokens": 81992564.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "grad_norm": 1.754274845123291, + "learning_rate": 9.097075031793132e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.833207368850708, + "num_tokens": 82029222.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "grad_norm": 1.6300755739212036, + 
"learning_rate": 9.101314116150911e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.844707727432251, + "num_tokens": 82066413.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "grad_norm": 1.5462634563446045, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.849591851234436, + "num_tokens": 82103661.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "grad_norm": 1.5665515661239624, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8537114858627319, + "num_tokens": 82141415.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "grad_norm": 1.6019400358200073, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.853229284286499, + "num_tokens": 82180129.0, + "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "grad_norm": 1.5605684518814087, + "learning_rate": 9.118270453582026e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8503667116165161, + "num_tokens": 82222116.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "grad_norm": 1.5712389945983887, + "learning_rate": 9.122509537939804e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8565109372138977, + "num_tokens": 82260579.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "grad_norm": 1.555984616279602, + "learning_rate": 9.126748622297584e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8518960475921631, + "num_tokens": 82300460.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "grad_norm": 1.610297679901123, + "learning_rate": 9.130987706655362e-07, + "loss": 0.3898, + "mean_token_accuracy": 0.8689589500427246, + "num_tokens": 82334407.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "grad_norm": 1.5171821117401123, + "learning_rate": 9.135226791013141e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.849373459815979, + "num_tokens": 82373263.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "grad_norm": 
1.4955604076385498, + "learning_rate": 9.13946587537092e-07, + "loss": 0.3977, + "mean_token_accuracy": 0.8663689494132996, + "num_tokens": 82412878.0, + "step": 2157 + }, + { + "epoch": 0.2745197811983208, + "grad_norm": 1.5779194831848145, + "learning_rate": 9.143704959728699e-07, + "loss": 0.3892, + "mean_token_accuracy": 0.867812991142273, + "num_tokens": 82447310.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "grad_norm": 1.51838219165802, + "learning_rate": 9.147944044086476e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.8565244674682617, + "num_tokens": 82486035.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "grad_norm": 1.5191417932510376, + "learning_rate": 9.152183128444255e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8589320778846741, + "num_tokens": 82524839.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "grad_norm": 1.654909610748291, + "learning_rate": 9.156422212802034e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8438690900802612, + "num_tokens": 82564649.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "grad_norm": 1.581939935684204, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.854102611541748, + "num_tokens": 82601946.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "grad_norm": 1.6732823848724365, + "learning_rate": 9.164900381517592e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8514962196350098, + "num_tokens": 82635994.0, + "step": 2163 + }, + { + "epoch": 0.2752830428698639, + "grad_norm": 1.5052096843719482, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4037, + "mean_token_accuracy": 0.8651483058929443, + "num_tokens": 82676502.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "grad_norm": 1.6336135864257812, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8471472263336182, + "num_tokens": 82712248.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + 
"grad_norm": 1.5033477544784546, + "learning_rate": 9.177617634590928e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8497260808944702, + "num_tokens": 82755517.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "grad_norm": 1.5024397373199463, + "learning_rate": 9.181856718948706e-07, + "loss": 0.3944, + "mean_token_accuracy": 0.8643894195556641, + "num_tokens": 82791585.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "grad_norm": 1.491916537284851, + "learning_rate": 9.186095803306485e-07, + "loss": 0.4024, + "mean_token_accuracy": 0.8648828268051147, + "num_tokens": 82828703.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "grad_norm": 1.5073356628417969, + "learning_rate": 9.190334887664264e-07, + "loss": 0.421, + "mean_token_accuracy": 0.859870433807373, + "num_tokens": 82865633.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + "grad_norm": 1.5153367519378662, + "learning_rate": 9.194573972022043e-07, + "loss": 0.394, + "mean_token_accuracy": 0.8680694103240967, + "num_tokens": 82907264.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "grad_norm": 1.435413122177124, + "learning_rate": 9.198813056379822e-07, + "loss": 0.4016, + "mean_token_accuracy": 0.8640708923339844, + "num_tokens": 82946856.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "grad_norm": 1.5095770359039307, + "learning_rate": 9.2030521407376e-07, + "loss": 0.3998, + "mean_token_accuracy": 0.863517701625824, + "num_tokens": 82983648.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "grad_norm": 1.6095253229141235, + "learning_rate": 9.20729122509538e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8420819044113159, + "num_tokens": 83023247.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "grad_norm": 1.5151203870773315, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4169, + "mean_token_accuracy": 0.8615522384643555, + "num_tokens": 83061176.0, + "step": 2174 + }, + { + "epoch": 
0.27668235593435947, + "grad_norm": 1.4612747430801392, + "learning_rate": 9.215769393810936e-07, + "loss": 0.426, + "mean_token_accuracy": 0.856224536895752, + "num_tokens": 83103687.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "grad_norm": 1.4556732177734375, + "learning_rate": 9.220008478168715e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8549554347991943, + "num_tokens": 83146766.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "grad_norm": 1.5000911951065063, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4285, + "mean_token_accuracy": 0.8532974720001221, + "num_tokens": 83189695.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "grad_norm": 1.5164424180984497, + "learning_rate": 9.228486646884273e-07, + "loss": 0.3773, + "mean_token_accuracy": 0.8711062669754028, + "num_tokens": 83225698.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "grad_norm": 1.4329848289489746, + "learning_rate": 9.232725731242052e-07, + "loss": 0.4193, + "mean_token_accuracy": 0.8590552806854248, + "num_tokens": 83269037.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "grad_norm": 1.7288053035736084, + "learning_rate": 9.23696481559983e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8529846668243408, + "num_tokens": 83303783.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "grad_norm": 1.6208664178848267, + "learning_rate": 9.24120389995761e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8473386764526367, + "num_tokens": 83338713.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "grad_norm": 1.5617029666900635, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8494723439216614, + "num_tokens": 83381478.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "grad_norm": 1.570131540298462, + "learning_rate": 9.249682068673165e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8658539056777954, + "num_tokens": 83419862.0, + "step": 2183 + }, + { + 
"epoch": 0.2778272484416741, + "grad_norm": 1.4955592155456543, + "learning_rate": 9.253921153030945e-07, + "loss": 0.4083, + "mean_token_accuracy": 0.8597840070724487, + "num_tokens": 83457122.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "grad_norm": 1.3919827938079834, + "learning_rate": 9.258160237388723e-07, + "loss": 0.3498, + "mean_token_accuracy": 0.8784346580505371, + "num_tokens": 83497340.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "grad_norm": 1.6138116121292114, + "learning_rate": 9.262399321746503e-07, + "loss": 0.493, + "mean_token_accuracy": 0.840710461139679, + "num_tokens": 83536932.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "grad_norm": 1.5558831691741943, + "learning_rate": 9.266638406104281e-07, + "loss": 0.4123, + "mean_token_accuracy": 0.8629026412963867, + "num_tokens": 83574840.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "grad_norm": 1.5563726425170898, + "learning_rate": 9.27087749046206e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8478875756263733, + "num_tokens": 83615522.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "grad_norm": 1.3895021677017212, + "learning_rate": 9.275116574819839e-07, + "loss": 0.3673, + "mean_token_accuracy": 0.8760485649108887, + "num_tokens": 83656109.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "grad_norm": 1.5347630977630615, + "learning_rate": 9.279355659177617e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8513020873069763, + "num_tokens": 83693682.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "grad_norm": 1.5759469270706177, + "learning_rate": 9.283594743535395e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.8562544584274292, + "num_tokens": 83731575.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "grad_norm": 1.6210318803787231, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.850324809551239, + "num_tokens": 83766248.0, + "step": 2192 + }, 
+ { + "epoch": 0.2789721409489887, + "grad_norm": 1.4989597797393799, + "learning_rate": 9.292072912250953e-07, + "loss": 0.4383, + "mean_token_accuracy": 0.8526315093040466, + "num_tokens": 83806834.0, + "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "grad_norm": 1.5537127256393433, + "learning_rate": 9.296311996608733e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8468319177627563, + "num_tokens": 83848672.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "grad_norm": 1.6415388584136963, + "learning_rate": 9.300551080966511e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8460452556610107, + "num_tokens": 83882587.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "grad_norm": 1.6299760341644287, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4173, + "mean_token_accuracy": 0.8579957485198975, + "num_tokens": 83917777.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "grad_norm": 1.4997864961624146, + "learning_rate": 9.309029249682068e-07, + "loss": 0.3787, + "mean_token_accuracy": 0.8671751022338867, + "num_tokens": 83956812.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "grad_norm": 1.669103980064392, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4025, + "mean_token_accuracy": 0.8641928434371948, + "num_tokens": 83993157.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "grad_norm": 1.5223945379257202, + "learning_rate": 9.317507418397625e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8540189266204834, + "num_tokens": 84031785.0, + "step": 2199 + }, + { + "epoch": 0.27986261289912223, + "grad_norm": 1.6806854009628296, + "learning_rate": 9.321746502755404e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8480669260025024, + "num_tokens": 84066766.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "grad_norm": 1.538877248764038, + "learning_rate": 9.325985587113183e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.855000913143158, + "num_tokens": 84106549.0, + "step": 
2201 + }, + { + "epoch": 0.2801170334563033, + "grad_norm": 1.6588941812515259, + "learning_rate": 9.330224671470962e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8421416282653809, + "num_tokens": 84144948.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "grad_norm": 1.7796640396118164, + "learning_rate": 9.334463755828741e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8462076187133789, + "num_tokens": 84177325.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "grad_norm": 1.4378843307495117, + "learning_rate": 9.338702840186519e-07, + "loss": 0.3936, + "mean_token_accuracy": 0.8659113645553589, + "num_tokens": 84220471.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "grad_norm": 1.5674704313278198, + "learning_rate": 9.342941924544298e-07, + "loss": 0.4121, + "mean_token_accuracy": 0.8587727546691895, + "num_tokens": 84256808.0, + "step": 2205 + }, + { + "epoch": 0.2806258745706653, + "grad_norm": 1.5359454154968262, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8557677268981934, + "num_tokens": 84295212.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "grad_norm": 1.8264201879501343, + "learning_rate": 9.351420093259855e-07, + "loss": 0.4094, + "mean_token_accuracy": 0.8586187362670898, + "num_tokens": 84323498.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "grad_norm": 1.5510720014572144, + "learning_rate": 9.355659177617634e-07, + "loss": 0.416, + "mean_token_accuracy": 0.860710859298706, + "num_tokens": 84361714.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "grad_norm": 1.5579109191894531, + "learning_rate": 9.359898261975413e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8471994400024414, + "num_tokens": 84404611.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "grad_norm": 1.7012073993682861, + "learning_rate": 9.364137346333192e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8482838273048401, + "num_tokens": 84443178.0, + 
"step": 2210 + }, + { + "epoch": 0.2812619259636179, + "grad_norm": 1.54647958278656, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8590694069862366, + "num_tokens": 84482629.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + "grad_norm": 1.46537184715271, + "learning_rate": 9.372615515048749e-07, + "loss": 0.3965, + "mean_token_accuracy": 0.8657100200653076, + "num_tokens": 84521159.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "grad_norm": 1.5771883726119995, + "learning_rate": 9.376854599406528e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8508026599884033, + "num_tokens": 84558098.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "grad_norm": 1.649823784828186, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4342, + "mean_token_accuracy": 0.8513245582580566, + "num_tokens": 84593384.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "grad_norm": 1.4855226278305054, + "learning_rate": 9.385332768122085e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8441493511199951, + "num_tokens": 84636150.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "grad_norm": 1.6382315158843994, + "learning_rate": 9.389571852479864e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8464510440826416, + "num_tokens": 84673468.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "grad_norm": 1.6533221006393433, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4317, + "mean_token_accuracy": 0.8521289825439453, + "num_tokens": 84708867.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "grad_norm": 1.5880316495895386, + "learning_rate": 9.398050021195422e-07, + "loss": 0.4245, + "mean_token_accuracy": 0.8547117710113525, + "num_tokens": 84742059.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "grad_norm": 1.5529582500457764, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8508602380752563, + "num_tokens": 
84780439.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "grad_norm": 1.561176061630249, + "learning_rate": 9.406528189910978e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8474513292312622, + "num_tokens": 84819931.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "grad_norm": 1.652510166168213, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8519890308380127, + "num_tokens": 84855549.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "grad_norm": 1.5784599781036377, + "learning_rate": 9.415006358626536e-07, + "loss": 0.4242, + "mean_token_accuracy": 0.8566007018089294, + "num_tokens": 84890987.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "grad_norm": 1.5004578828811646, + "learning_rate": 9.419245442984314e-07, + "loss": 0.3874, + "mean_token_accuracy": 0.8685135245323181, + "num_tokens": 84928664.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "grad_norm": 1.731076717376709, + "learning_rate": 9.423484527342094e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8463874459266663, + "num_tokens": 84961802.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "grad_norm": 1.5407721996307373, + "learning_rate": 9.427723611699872e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8458570241928101, + "num_tokens": 85001075.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "grad_norm": 1.3918020725250244, + "learning_rate": 9.431962696057652e-07, + "loss": 0.409, + "mean_token_accuracy": 0.8621834516525269, + "num_tokens": 85044173.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "grad_norm": 1.6195908784866333, + "learning_rate": 9.43620178041543e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8503255248069763, + "num_tokens": 85083220.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "grad_norm": 1.6737030744552612, + "learning_rate": 9.440440864773208e-07, + "loss": 0.4044, + "mean_token_accuracy": 0.8606877326965332, + 
"num_tokens": 85118082.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "grad_norm": 1.4587680101394653, + "learning_rate": 9.444679949130987e-07, + "loss": 0.4123, + "mean_token_accuracy": 0.8557918071746826, + "num_tokens": 85158267.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "grad_norm": 1.5648351907730103, + "learning_rate": 9.448919033488766e-07, + "loss": 0.3743, + "mean_token_accuracy": 0.8676215410232544, + "num_tokens": 85190587.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "grad_norm": 1.5142502784729004, + "learning_rate": 9.453158117846544e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.8575007319450378, + "num_tokens": 85227683.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "grad_norm": 1.611985206604004, + "learning_rate": 9.457397202204324e-07, + "loss": 0.3926, + "mean_token_accuracy": 0.8656816482543945, + "num_tokens": 85263174.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "grad_norm": 1.7601804733276367, + "learning_rate": 9.461636286562102e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8386558890342712, + "num_tokens": 85300954.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "grad_norm": 1.6227697134017944, + "learning_rate": 9.465875370919882e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8478351831436157, + "num_tokens": 85339246.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "grad_norm": 1.464145302772522, + "learning_rate": 9.470114455277659e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.85780268907547, + "num_tokens": 85381118.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "grad_norm": 1.4343360662460327, + "learning_rate": 9.474353539635438e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8436049818992615, + "num_tokens": 85427722.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "grad_norm": 1.654834270477295, + "learning_rate": 9.478592623993217e-07, + "loss": 0.4678, + "mean_token_accuracy": 
0.8433902859687805, + "num_tokens": 85465918.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "grad_norm": 1.5977884531021118, + "learning_rate": 9.482831708350996e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8471583127975464, + "num_tokens": 85507021.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "grad_norm": 1.811458706855774, + "learning_rate": 9.487070792708775e-07, + "loss": 0.4247, + "mean_token_accuracy": 0.8548687696456909, + "num_tokens": 85539838.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "grad_norm": 1.5336196422576904, + "learning_rate": 9.491309877066554e-07, + "loss": 0.4124, + "mean_token_accuracy": 0.8587645292282104, + "num_tokens": 85581185.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "grad_norm": 1.498063564300537, + "learning_rate": 9.495548961424332e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8472012281417847, + "num_tokens": 85622494.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "grad_norm": 1.4269319772720337, + "learning_rate": 9.499788045782111e-07, + "loss": 0.4123, + "mean_token_accuracy": 0.8602935671806335, + "num_tokens": 85666360.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "grad_norm": 1.6697372198104858, + "learning_rate": 9.504027130139889e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8437343239784241, + "num_tokens": 85700390.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "grad_norm": 1.702068567276001, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8421528935432434, + "num_tokens": 85736915.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "grad_norm": 1.553464651107788, + "learning_rate": 9.512505298855447e-07, + "loss": 0.3997, + "mean_token_accuracy": 0.8665453791618347, + "num_tokens": 85776430.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.547821044921875, + "learning_rate": 9.516744383213225e-07, + "loss": 0.4566, + 
"mean_token_accuracy": 0.8445698618888855, + "num_tokens": 85820700.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "grad_norm": 1.7107529640197754, + "learning_rate": 9.520983467571005e-07, + "loss": 0.4254, + "mean_token_accuracy": 0.8546836376190186, + "num_tokens": 85857954.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "grad_norm": 1.707823634147644, + "learning_rate": 9.525222551928783e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8375692963600159, + "num_tokens": 85892297.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "grad_norm": 1.4225287437438965, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8549458980560303, + "num_tokens": 85933624.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "grad_norm": 1.5282679796218872, + "learning_rate": 9.533700720644341e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8518396615982056, + "num_tokens": 85973782.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "grad_norm": 1.8501542806625366, + "learning_rate": 9.537939805002118e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8454570174217224, + "num_tokens": 86004174.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "grad_norm": 1.7099812030792236, + "learning_rate": 9.542178889359898e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8520637154579163, + "num_tokens": 86039888.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "grad_norm": 1.5468116998672485, + "learning_rate": 9.546417973717677e-07, + "loss": 0.3999, + "mean_token_accuracy": 0.8646615743637085, + "num_tokens": 86076642.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "grad_norm": 1.4886155128479004, + "learning_rate": 9.550657058075455e-07, + "loss": 0.4086, + "mean_token_accuracy": 0.8647122383117676, + "num_tokens": 86116223.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "grad_norm": 1.503873348236084, + "learning_rate": 9.554896142433234e-07, + "loss": 
0.422, + "mean_token_accuracy": 0.8557367920875549, + "num_tokens": 86157018.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "grad_norm": 1.500373363494873, + "learning_rate": 9.559135226791012e-07, + "loss": 0.405, + "mean_token_accuracy": 0.8636252880096436, + "num_tokens": 86195321.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "grad_norm": 1.5512986183166504, + "learning_rate": 9.563374311148793e-07, + "loss": 0.3725, + "mean_token_accuracy": 0.8731355667114258, + "num_tokens": 86231476.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "grad_norm": 1.6072250604629517, + "learning_rate": 9.56761339550657e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8458700776100159, + "num_tokens": 86269622.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "grad_norm": 1.5152194499969482, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8539779186248779, + "num_tokens": 86313099.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "grad_norm": 1.7052778005599976, + "learning_rate": 9.576091564222128e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8497627377510071, + "num_tokens": 86345606.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "grad_norm": 1.841580867767334, + "learning_rate": 9.580330648579906e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8368085622787476, + "num_tokens": 86383371.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "grad_norm": 1.6022319793701172, + "learning_rate": 9.584569732937685e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8600701093673706, + "num_tokens": 86420650.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "grad_norm": 1.6395193338394165, + "learning_rate": 9.588808817295463e-07, + "loss": 0.4225, + "mean_token_accuracy": 0.8554073572158813, + "num_tokens": 86455974.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "grad_norm": 1.5684263706207275, + "learning_rate": 9.593047901653242e-07, + 
"loss": 0.4425, + "mean_token_accuracy": 0.8513076901435852, + "num_tokens": 86495498.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "grad_norm": 1.59419846534729, + "learning_rate": 9.597286986011022e-07, + "loss": 0.428, + "mean_token_accuracy": 0.8546367287635803, + "num_tokens": 86536064.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "grad_norm": 1.4664654731750488, + "learning_rate": 9.601526070368799e-07, + "loss": 0.4472, + "mean_token_accuracy": 0.8511837720870972, + "num_tokens": 86581665.0, + "step": 2266 + }, + { + "epoch": 0.2883857015646864, + "grad_norm": 1.5193750858306885, + "learning_rate": 9.60576515472658e-07, + "loss": 0.4056, + "mean_token_accuracy": 0.8639590740203857, + "num_tokens": 86625610.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "grad_norm": 1.5392470359802246, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8418726921081543, + "num_tokens": 86664418.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "grad_norm": 1.5910604000091553, + "learning_rate": 9.614243323442136e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8397932052612305, + "num_tokens": 86703009.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "grad_norm": 1.7093580961227417, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8553197383880615, + "num_tokens": 86737176.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "grad_norm": 1.4689100980758667, + "learning_rate": 9.622721492157693e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8573065996170044, + "num_tokens": 86776750.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "grad_norm": 1.5940232276916504, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.855481743812561, + "num_tokens": 86816244.0, + "step": 2272 + }, + { + "epoch": 0.28914896323622946, + "grad_norm": 1.4534920454025269, + "learning_rate": 
9.63119966087325e-07, + "loss": 0.3833, + "mean_token_accuracy": 0.8696932196617126, + "num_tokens": 86853472.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "grad_norm": 1.5294458866119385, + "learning_rate": 9.635438745231029e-07, + "loss": 0.3985, + "mean_token_accuracy": 0.8642979860305786, + "num_tokens": 86892001.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "grad_norm": 1.6462538242340088, + "learning_rate": 9.63967782958881e-07, + "loss": 0.3944, + "mean_token_accuracy": 0.8662271499633789, + "num_tokens": 86926062.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "grad_norm": 1.454560399055481, + "learning_rate": 9.643916913946588e-07, + "loss": 0.4094, + "mean_token_accuracy": 0.8633213639259338, + "num_tokens": 86969254.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "grad_norm": 1.588249683380127, + "learning_rate": 9.648155998304366e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8515122532844543, + "num_tokens": 87007489.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "grad_norm": 1.6360547542572021, + "learning_rate": 9.652395082662145e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8562504053115845, + "num_tokens": 87043267.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + "grad_norm": 1.561572790145874, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4358, + "mean_token_accuracy": 0.8481804728507996, + "num_tokens": 87081747.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "grad_norm": 1.6253238916397095, + "learning_rate": 9.660873251377701e-07, + "loss": 0.435, + "mean_token_accuracy": 0.854659378528595, + "num_tokens": 87118681.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "grad_norm": 1.6279587745666504, + "learning_rate": 9.66511233573548e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8429222106933594, + "num_tokens": 87151800.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "grad_norm": 1.6252975463867188, + "learning_rate": 
9.669351420093258e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8298963308334351, + "num_tokens": 87190826.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "grad_norm": 1.5259441137313843, + "learning_rate": 9.67359050445104e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8375328779220581, + "num_tokens": 87232514.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "grad_norm": 1.569757342338562, + "learning_rate": 9.677829588808817e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8508802652359009, + "num_tokens": 87274035.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "grad_norm": 2.3250832557678223, + "learning_rate": 9.682068673166596e-07, + "loss": 0.4381, + "mean_token_accuracy": 0.8527284860610962, + "num_tokens": 87313101.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "grad_norm": 1.500466227531433, + "learning_rate": 9.686307757524374e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8617537021636963, + "num_tokens": 87352573.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "grad_norm": 1.59141206741333, + "learning_rate": 9.690546841882153e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.8618805408477783, + "num_tokens": 87385863.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "grad_norm": 1.4374628067016602, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.853108286857605, + "num_tokens": 87425331.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "grad_norm": 1.7582248449325562, + "learning_rate": 9.69902501059771e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8412601947784424, + "num_tokens": 87457995.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "grad_norm": 1.4817044734954834, + "learning_rate": 9.703264094955488e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8432481288909912, + "num_tokens": 87499673.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "grad_norm": 1.5129144191741943, + 
"learning_rate": 9.707503179313269e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8513168096542358, + "num_tokens": 87539911.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "grad_norm": 1.5266245603561401, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4076, + "mean_token_accuracy": 0.8613042831420898, + "num_tokens": 87575964.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "grad_norm": 1.5893990993499756, + "learning_rate": 9.715981348028826e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8549005389213562, + "num_tokens": 87613617.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "grad_norm": 1.6121121644973755, + "learning_rate": 9.720220432386604e-07, + "loss": 0.4009, + "mean_token_accuracy": 0.8645784854888916, + "num_tokens": 87649075.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "grad_norm": 1.6023205518722534, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8526322841644287, + "num_tokens": 87688014.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "grad_norm": 1.687204360961914, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8566539883613586, + "num_tokens": 87720574.0, + "step": 2296 + }, + { + "epoch": 0.29220200992240175, + "grad_norm": 1.5787221193313599, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8436095714569092, + "num_tokens": 87760233.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "grad_norm": 1.6019134521484375, + "learning_rate": 9.737176769817718e-07, + "loss": 0.4061, + "mean_token_accuracy": 0.8671482801437378, + "num_tokens": 87794814.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "grad_norm": 1.548668384552002, + "learning_rate": 9.741415854175499e-07, + "loss": 0.3639, + "mean_token_accuracy": 0.8779423236846924, + "num_tokens": 87829676.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "grad_norm": 
1.5146732330322266, + "learning_rate": 9.745654938533277e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8525471687316895, + "num_tokens": 87869570.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "grad_norm": 1.5333434343338013, + "learning_rate": 9.749894022891056e-07, + "loss": 0.4036, + "mean_token_accuracy": 0.8627098798751831, + "num_tokens": 87906970.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "grad_norm": 1.517073631286621, + "learning_rate": 9.754133107248834e-07, + "loss": 0.44, + "mean_token_accuracy": 0.8499341011047363, + "num_tokens": 87948410.0, + "step": 2302 + }, + { + "epoch": 0.2929652715939448, + "grad_norm": 1.6590415239334106, + "learning_rate": 9.758372191606612e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8441811203956604, + "num_tokens": 87988687.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "grad_norm": 1.89372718334198, + "learning_rate": 9.76261127596439e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8435032367706299, + "num_tokens": 88018601.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "grad_norm": 1.5151221752166748, + "learning_rate": 9.76685036032217e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8630832433700562, + "num_tokens": 88058390.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "grad_norm": 1.5797090530395508, + "learning_rate": 9.771089444679948e-07, + "loss": 0.425, + "mean_token_accuracy": 0.8555147647857666, + "num_tokens": 88095948.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "grad_norm": 1.6009914875030518, + "learning_rate": 9.775328529037728e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.8560214042663574, + "num_tokens": 88131715.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "grad_norm": 1.5164191722869873, + "learning_rate": 9.779567613395507e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8495064973831177, + "num_tokens": 88171190.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + 
"grad_norm": 1.48507559299469, + "learning_rate": 9.783806697753285e-07, + "loss": 0.3709, + "mean_token_accuracy": 0.8738687038421631, + "num_tokens": 88209065.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "grad_norm": 1.5376428365707397, + "learning_rate": 9.788045782111064e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.855410099029541, + "num_tokens": 88249080.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "grad_norm": 1.6058076620101929, + "learning_rate": 9.792284866468842e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8418658971786499, + "num_tokens": 88288717.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "grad_norm": 1.7689787149429321, + "learning_rate": 9.79652395082662e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8368252515792847, + "num_tokens": 88320797.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "grad_norm": 1.5108708143234253, + "learning_rate": 9.8007630351844e-07, + "loss": 0.377, + "mean_token_accuracy": 0.8716250658035278, + "num_tokens": 88358752.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "grad_norm": 1.5255377292633057, + "learning_rate": 9.805002119542178e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8519148826599121, + "num_tokens": 88398749.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "grad_norm": 1.5500835180282593, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4047, + "mean_token_accuracy": 0.8605014085769653, + "num_tokens": 88437685.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "grad_norm": 1.5920649766921997, + "learning_rate": 9.813480288257737e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8508040904998779, + "num_tokens": 88475970.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "grad_norm": 1.4652578830718994, + "learning_rate": 9.817719372615515e-07, + "loss": 0.3621, + "mean_token_accuracy": 0.8769574165344238, + "num_tokens": 88514834.0, + "step": 2317 + }, + { + "epoch": 
0.29487342577280246, + "grad_norm": 1.443069338798523, + "learning_rate": 9.821958456973294e-07, + "loss": 0.4178, + "mean_token_accuracy": 0.8588129281997681, + "num_tokens": 88560547.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "grad_norm": 1.5962860584259033, + "learning_rate": 9.826197541331072e-07, + "loss": 0.384, + "mean_token_accuracy": 0.8684040307998657, + "num_tokens": 88595980.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "grad_norm": 1.5918809175491333, + "learning_rate": 9.83043662568885e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8561419248580933, + "num_tokens": 88631738.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "grad_norm": 1.409315824508667, + "learning_rate": 9.83467571004663e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.8592346906661987, + "num_tokens": 88677960.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "grad_norm": 1.4913649559020996, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.8587039113044739, + "num_tokens": 88717270.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "grad_norm": 1.4359371662139893, + "learning_rate": 9.843153878762188e-07, + "loss": 0.3949, + "mean_token_accuracy": 0.8665279746055603, + "num_tokens": 88757329.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "grad_norm": 1.4906280040740967, + "learning_rate": 9.847392963119966e-07, + "loss": 0.4089, + "mean_token_accuracy": 0.8595287203788757, + "num_tokens": 88796631.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "grad_norm": 1.5740734338760376, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4153, + "mean_token_accuracy": 0.8583574295043945, + "num_tokens": 88833983.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "grad_norm": 1.5872560739517212, + "learning_rate": 9.855871131835523e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8472739458084106, + "num_tokens": 88872959.0, + "step": 2326 + }, + { + 
"epoch": 0.29601831828011704, + "grad_norm": 1.5223134756088257, + "learning_rate": 9.860110216193302e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.852509081363678, + "num_tokens": 88909387.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "grad_norm": 1.5906189680099487, + "learning_rate": 9.86434930055108e-07, + "loss": 0.4074, + "mean_token_accuracy": 0.8610033392906189, + "num_tokens": 88946640.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "grad_norm": 1.5908708572387695, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8609097003936768, + "num_tokens": 88983481.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "grad_norm": 1.4413894414901733, + "learning_rate": 9.872827469266637e-07, + "loss": 0.421, + "mean_token_accuracy": 0.8571498394012451, + "num_tokens": 89027061.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "grad_norm": 1.559259057044983, + "learning_rate": 9.877066553624418e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.868238091468811, + "num_tokens": 89061925.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "grad_norm": 1.4295777082443237, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4133, + "mean_token_accuracy": 0.8620076179504395, + "num_tokens": 89102912.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "grad_norm": 1.511415958404541, + "learning_rate": 9.885544722339975e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8486990928649902, + "num_tokens": 89142790.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "grad_norm": 1.6311721801757812, + "learning_rate": 9.889783806697753e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8566063642501831, + "num_tokens": 89180465.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "grad_norm": 1.5715112686157227, + "learning_rate": 9.894022891055532e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8554115295410156, + "num_tokens": 89217921.0, + "step": 2335 + }, + { 
+ "epoch": 0.29716321078743163, + "grad_norm": 1.5931216478347778, + "learning_rate": 9.89826197541331e-07, + "loss": 0.423, + "mean_token_accuracy": 0.8600391149520874, + "num_tokens": 89255655.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "grad_norm": 1.3943454027175903, + "learning_rate": 9.902501059771089e-07, + "loss": 0.3948, + "mean_token_accuracy": 0.868672251701355, + "num_tokens": 89299040.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "grad_norm": 1.5145035982131958, + "learning_rate": 9.906740144128867e-07, + "loss": 0.3828, + "mean_token_accuracy": 0.8666461706161499, + "num_tokens": 89335226.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "grad_norm": 1.4371109008789062, + "learning_rate": 9.910979228486648e-07, + "loss": 0.3953, + "mean_token_accuracy": 0.8642072081565857, + "num_tokens": 89373102.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "grad_norm": 1.6052013635635376, + "learning_rate": 9.915218312844426e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8447805643081665, + "num_tokens": 89412487.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "grad_norm": 1.4975038766860962, + "learning_rate": 9.919457397202205e-07, + "loss": 0.4191, + "mean_token_accuracy": 0.8587470650672913, + "num_tokens": 89453394.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "grad_norm": 1.4376769065856934, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4163, + "mean_token_accuracy": 0.8607428669929504, + "num_tokens": 89494469.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "grad_norm": 1.6937748193740845, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4087, + "mean_token_accuracy": 0.8600755929946899, + "num_tokens": 89525745.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "grad_norm": 1.6191869974136353, + "learning_rate": 9.93217465027554e-07, + "loss": 0.3943, + "mean_token_accuracy": 0.8676099181175232, + "num_tokens": 89562013.0, + "step": 2344 
+ }, + { + "epoch": 0.2983081032947462, + "grad_norm": 1.7011367082595825, + "learning_rate": 9.936413734633318e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8528293371200562, + "num_tokens": 89597676.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "grad_norm": 1.6028399467468262, + "learning_rate": 9.940652818991097e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8496438264846802, + "num_tokens": 89635703.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "grad_norm": 1.573494553565979, + "learning_rate": 9.944891903348877e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8519763350486755, + "num_tokens": 89671942.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "grad_norm": 1.5590665340423584, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8527899980545044, + "num_tokens": 89710878.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "grad_norm": 1.5598069429397583, + "learning_rate": 9.953370072064432e-07, + "loss": 0.4176, + "mean_token_accuracy": 0.8561602830886841, + "num_tokens": 89750149.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "grad_norm": 1.7719064950942993, + "learning_rate": 9.957609156422213e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8474377989768982, + "num_tokens": 89784212.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "grad_norm": 1.5272926092147827, + "learning_rate": 9.961848240779991e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.85884690284729, + "num_tokens": 89821147.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "grad_norm": 1.529982566833496, + "learning_rate": 9.96608732513777e-07, + "loss": 0.4262, + "mean_token_accuracy": 0.8577995896339417, + "num_tokens": 89861014.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "grad_norm": 1.5529958009719849, + "learning_rate": 9.970326409495548e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8503466248512268, + "num_tokens": 89902095.0, + "step": 
2353 + }, + { + "epoch": 0.2994529958020608, + "grad_norm": 1.7395997047424316, + "learning_rate": 9.974565493853327e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8579106330871582, + "num_tokens": 89935506.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "grad_norm": 1.5270285606384277, + "learning_rate": 9.978804578211107e-07, + "loss": 0.3928, + "mean_token_accuracy": 0.8691680431365967, + "num_tokens": 89973084.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "grad_norm": 1.5452907085418701, + "learning_rate": 9.983043662568886e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8485678434371948, + "num_tokens": 90010734.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "grad_norm": 1.8379111289978027, + "learning_rate": 9.987282746926662e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8304129838943481, + "num_tokens": 90040821.0, + "step": 2357 + }, + { + "epoch": 0.29996183691642286, + "grad_norm": 1.6088542938232422, + "learning_rate": 9.991521831284443e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8543812036514282, + "num_tokens": 90080037.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "grad_norm": 1.6560070514678955, + "learning_rate": 9.995760915642221e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8549200296401978, + "num_tokens": 90115130.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "grad_norm": 1.5365110635757446, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8589047789573669, + "num_tokens": 90156040.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "grad_norm": 1.5310947895050049, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8465526103973389, + "num_tokens": 90198157.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "grad_norm": 1.502692699432373, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8499413132667542, + "num_tokens": 90240023.0, + "step": 2362 + }, + { + "epoch": 
0.3005978883093754, + "grad_norm": 1.5607171058654785, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8614866733551025, + "num_tokens": 90275177.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "grad_norm": 1.400526523590088, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8656071424484253, + "num_tokens": 90316935.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "grad_norm": 1.5156015157699585, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.872649073600769, + "num_tokens": 90356983.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "grad_norm": 1.4655464887619019, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8566461801528931, + "num_tokens": 90396202.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "grad_norm": 1.5869174003601074, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8521183729171753, + "num_tokens": 90433982.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "grad_norm": 1.6582674980163574, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8449257016181946, + "num_tokens": 90468447.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "grad_norm": 1.5792285203933716, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8561298847198486, + "num_tokens": 90501875.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "grad_norm": 1.5682859420776367, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8666988611221313, + "num_tokens": 90538323.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "grad_norm": 1.5922876596450806, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8738343715667725, + "num_tokens": 90574523.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "grad_norm": 1.4919039011001587, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 
0.8465064764022827, + "num_tokens": 90613912.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "grad_norm": 1.4441665410995483, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8512039184570312, + "num_tokens": 90659302.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "grad_norm": 1.6061584949493408, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8465782403945923, + "num_tokens": 90695395.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "grad_norm": 1.396621584892273, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8740482926368713, + "num_tokens": 90732836.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "grad_norm": 1.4429465532302856, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8445448279380798, + "num_tokens": 90781688.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "grad_norm": 1.5473909378051758, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8593720197677612, + "num_tokens": 90818902.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "grad_norm": 1.4175910949707031, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8670107126235962, + "num_tokens": 90860606.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "grad_norm": 1.4290533065795898, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8720372915267944, + "num_tokens": 90901441.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "grad_norm": 1.6997861862182617, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8501777648925781, + "num_tokens": 90937416.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "grad_norm": 1.5468995571136475, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.855038046836853, + "num_tokens": 90973970.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "grad_norm": 1.4058014154434204, 
+ "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8698852062225342, + "num_tokens": 91015390.0, + "step": 2382 + }, + { + "epoch": 0.3031420938811856, + "grad_norm": 1.5989562273025513, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8373799324035645, + "num_tokens": 91050625.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "grad_norm": 1.5737817287445068, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.843197762966156, + "num_tokens": 91089899.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "grad_norm": 1.5917695760726929, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8576719760894775, + "num_tokens": 91128375.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "grad_norm": 1.5368914604187012, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8489377498626709, + "num_tokens": 91169919.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "grad_norm": 1.6392855644226074, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8574765920639038, + "num_tokens": 91205901.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "grad_norm": 1.4679054021835327, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8716826438903809, + "num_tokens": 91244604.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "grad_norm": 1.461267113685608, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8672342300415039, + "num_tokens": 91286497.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "grad_norm": 1.5521695613861084, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8384164571762085, + "num_tokens": 91328745.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "grad_norm": 1.5567162036895752, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8604618906974792, + "num_tokens": 91365114.0, + "step": 2391 + }, 
+ { + "epoch": 0.3042869863885002, + "grad_norm": 1.5999547243118286, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8482812643051147, + "num_tokens": 91399609.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "grad_norm": 1.4755853414535522, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8588334321975708, + "num_tokens": 91442075.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "grad_norm": 1.6573266983032227, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8551582098007202, + "num_tokens": 91478733.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "grad_norm": 1.5935603380203247, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8410215377807617, + "num_tokens": 91514121.0, + "step": 2395 + }, + { + "epoch": 0.3047958275028622, + "grad_norm": 1.5201678276062012, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8398114442825317, + "num_tokens": 91554374.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "grad_norm": 1.482250452041626, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8604337573051453, + "num_tokens": 91594810.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "grad_norm": 1.6883660554885864, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8573791980743408, + "num_tokens": 91630070.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "grad_norm": 1.6331431865692139, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8444708585739136, + "num_tokens": 91668405.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "grad_norm": 1.5520201921463013, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8434310555458069, + "num_tokens": 91706187.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "grad_norm": 1.4580878019332886, + "learning_rate": 1e-06, + "loss": 0.4091, + 
"mean_token_accuracy": 0.8616238832473755, + "num_tokens": 91745805.0, + "step": 2401 + }, + { + "epoch": 0.3055590891744053, + "grad_norm": 1.5377098321914673, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8508092164993286, + "num_tokens": 91789835.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "grad_norm": 1.6041271686553955, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8533253073692322, + "num_tokens": 91823378.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "grad_norm": 1.4732617139816284, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.845651388168335, + "num_tokens": 91869542.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "grad_norm": 1.383049726486206, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8747909069061279, + "num_tokens": 91913946.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "grad_norm": 1.5459023714065552, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8583389520645142, + "num_tokens": 91951326.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "grad_norm": 1.7736769914627075, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8667224645614624, + "num_tokens": 91984907.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "grad_norm": 1.6374398469924927, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8482445478439331, + "num_tokens": 92021917.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "grad_norm": 1.7272472381591797, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8599191904067993, + "num_tokens": 92052458.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "grad_norm": 1.6419448852539062, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8594715595245361, + "num_tokens": 92089734.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + 
"grad_norm": 1.4740177392959595, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.847840428352356, + "num_tokens": 92131720.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "grad_norm": 1.6090608835220337, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8552917242050171, + "num_tokens": 92169072.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "grad_norm": 1.480303406715393, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.854604959487915, + "num_tokens": 92211441.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "grad_norm": 1.3769253492355347, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8623239994049072, + "num_tokens": 92258355.0, + "step": 2414 + }, + { + "epoch": 0.3072128227960819, + "grad_norm": 1.6713230609893799, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8650111556053162, + "num_tokens": 92292524.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "grad_norm": 1.73444402217865, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8508200645446777, + "num_tokens": 92327285.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "grad_norm": 1.5245054960250854, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8538429737091064, + "num_tokens": 92368342.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "grad_norm": 1.5237168073654175, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.855621337890625, + "num_tokens": 92410957.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "grad_norm": 1.8610743284225464, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.858031153678894, + "num_tokens": 92442830.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "grad_norm": 1.5765684843063354, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8544261455535889, + "num_tokens": 
92484256.0, + "step": 2420 + }, + { + "epoch": 0.307976084467625, + "grad_norm": 1.5057244300842285, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8616657257080078, + "num_tokens": 92524869.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "grad_norm": 1.509764313697815, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8584426641464233, + "num_tokens": 92564154.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "grad_norm": 1.5060595273971558, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8661489486694336, + "num_tokens": 92602259.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "grad_norm": 1.584647297859192, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8564666509628296, + "num_tokens": 92640852.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "grad_norm": 1.524039626121521, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.857520341873169, + "num_tokens": 92681895.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "grad_norm": 1.4377610683441162, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8707508444786072, + "num_tokens": 92724614.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "grad_norm": 1.7053022384643555, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8489493727684021, + "num_tokens": 92760338.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "grad_norm": 1.5115453004837036, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8561915159225464, + "num_tokens": 92796817.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "grad_norm": 1.4560656547546387, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8578008413314819, + "num_tokens": 92836883.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "grad_norm": 1.480063796043396, + "learning_rate": 1e-06, + "loss": 
0.3737, + "mean_token_accuracy": 0.8716299533843994, + "num_tokens": 92876037.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "grad_norm": 1.4804770946502686, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8746402263641357, + "num_tokens": 92911269.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "grad_norm": 1.419094204902649, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8662239909172058, + "num_tokens": 92953715.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "grad_norm": 1.5465281009674072, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8562283515930176, + "num_tokens": 92994585.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "grad_norm": 1.581499695777893, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8668522834777832, + "num_tokens": 93031268.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "grad_norm": 1.4031190872192383, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8753511905670166, + "num_tokens": 93072315.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "grad_norm": 1.5029364824295044, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.857555627822876, + "num_tokens": 93113822.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "grad_norm": 1.731763243675232, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8785965442657471, + "num_tokens": 93146035.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "grad_norm": 1.6592375040054321, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8547050952911377, + "num_tokens": 93182061.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "grad_norm": 1.969408392906189, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.874445915222168, + "num_tokens": 93216223.0, + "step": 2439 + }, + { + "epoch": 0.3103930797608447, + 
"grad_norm": 1.553176999092102, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.865465521812439, + "num_tokens": 93254688.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "grad_norm": 1.7207456827163696, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8689440488815308, + "num_tokens": 93290928.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "grad_norm": 1.5660040378570557, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8602613210678101, + "num_tokens": 93329515.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "grad_norm": 1.5542742013931274, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.867445170879364, + "num_tokens": 93370158.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "grad_norm": 1.5318435430526733, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8543817400932312, + "num_tokens": 93413952.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "grad_norm": 1.5022709369659424, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8434398174285889, + "num_tokens": 93457801.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "grad_norm": 1.518752932548523, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.862949013710022, + "num_tokens": 93495392.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "grad_norm": 1.7904988527297974, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8559025526046753, + "num_tokens": 93529454.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "grad_norm": 1.6640409231185913, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8340484499931335, + "num_tokens": 93571406.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "grad_norm": 1.5475507974624634, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8520268201828003, + "num_tokens": 
93612717.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "grad_norm": 1.68415367603302, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8458169102668762, + "num_tokens": 93647556.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "grad_norm": 1.5462312698364258, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8657938838005066, + "num_tokens": 93687620.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "grad_norm": 1.5417886972427368, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8567355871200562, + "num_tokens": 93729012.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "grad_norm": 1.5344010591506958, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8474757075309753, + "num_tokens": 93768404.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "grad_norm": 1.459513783454895, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8585383296012878, + "num_tokens": 93809936.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "grad_norm": 1.4674121141433716, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.855330228805542, + "num_tokens": 93850462.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "grad_norm": 1.62270188331604, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8491959571838379, + "num_tokens": 93886791.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "grad_norm": 1.7032947540283203, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8680959939956665, + "num_tokens": 93920180.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "grad_norm": 1.791021466255188, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.853019118309021, + "num_tokens": 93954636.0, + "step": 2458 + }, + { + "epoch": 0.3128100750540644, + "grad_norm": 1.6094757318496704, + "learning_rate": 1e-06, + "loss": 
0.4081, + "mean_token_accuracy": 0.8612211346626282, + "num_tokens": 93988258.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "grad_norm": 1.4044697284698486, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8533761501312256, + "num_tokens": 94031487.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "grad_norm": 1.7850526571273804, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8516181707382202, + "num_tokens": 94065239.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "grad_norm": 1.6022840738296509, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8602375984191895, + "num_tokens": 94103510.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "grad_norm": 1.6694952249526978, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8548025488853455, + "num_tokens": 94142460.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "grad_norm": 1.5614746809005737, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8661495447158813, + "num_tokens": 94178955.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "grad_norm": 1.5373896360397339, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8520563244819641, + "num_tokens": 94223827.0, + "step": 2465 + }, + { + "epoch": 0.3137005470041979, + "grad_norm": 1.6208617687225342, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8401224613189697, + "num_tokens": 94260291.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "grad_norm": 1.5200896263122559, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8578963875770569, + "num_tokens": 94299106.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "grad_norm": 1.5481486320495605, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.847939133644104, + "num_tokens": 94340015.0, + "step": 2468 + }, + { + "epoch": 
0.31408217783996945, + "grad_norm": 1.6481833457946777, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8486742377281189, + "num_tokens": 94376368.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "grad_norm": 1.6038621664047241, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8680357933044434, + "num_tokens": 94412899.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "grad_norm": 1.5761258602142334, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8583166599273682, + "num_tokens": 94445078.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "grad_norm": 1.4584852457046509, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8457712531089783, + "num_tokens": 94489193.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "grad_norm": 1.578697919845581, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8635041117668152, + "num_tokens": 94528627.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "grad_norm": 1.63949453830719, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.857798159122467, + "num_tokens": 94564603.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "grad_norm": 1.4434102773666382, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8423333168029785, + "num_tokens": 94608994.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "grad_norm": 1.424954891204834, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8659140467643738, + "num_tokens": 94650525.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "grad_norm": 1.5989749431610107, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8540815114974976, + "num_tokens": 94687067.0, + "step": 2477 + }, + { + "epoch": 0.31522707034728403, + "grad_norm": 1.7773783206939697, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 
0.8493572473526001, + "num_tokens": 94717231.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "grad_norm": 1.4823601245880127, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8636503219604492, + "num_tokens": 94764261.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "grad_norm": 1.5691413879394531, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8613889217376709, + "num_tokens": 94798174.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "grad_norm": 1.292117953300476, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8747215867042542, + "num_tokens": 94847234.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "grad_norm": 1.466948390007019, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8510792255401611, + "num_tokens": 94889212.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "grad_norm": 1.4891034364700317, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8434946537017822, + "num_tokens": 94928852.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "grad_norm": 1.6228902339935303, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8500940799713135, + "num_tokens": 94964161.0, + "step": 2484 + }, + { + "epoch": 0.3161175422974176, + "grad_norm": 1.515305995941162, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8509559631347656, + "num_tokens": 95005514.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "grad_norm": 1.528826117515564, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8632004261016846, + "num_tokens": 95043801.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "grad_norm": 1.6638751029968262, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8672502040863037, + "num_tokens": 95078211.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "grad_norm": 1.6276088953018188, 
+ "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8380025029182434, + "num_tokens": 95116841.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "grad_norm": 1.599629521369934, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8478490114212036, + "num_tokens": 95154284.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "grad_norm": 1.5881595611572266, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8580248355865479, + "num_tokens": 95188919.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "grad_norm": 1.4850882291793823, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.85650634765625, + "num_tokens": 95231206.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "grad_norm": 1.3992936611175537, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8552136421203613, + "num_tokens": 95276163.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "grad_norm": 1.5447849035263062, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8653309345245361, + "num_tokens": 95311024.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "grad_norm": 1.5319526195526123, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8660409450531006, + "num_tokens": 95346134.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "grad_norm": 1.6459565162658691, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8331854343414307, + "num_tokens": 95383106.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "grad_norm": 2.0166819095611572, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8454368114471436, + "num_tokens": 95419210.0, + "step": 2496 + }, + { + "epoch": 0.31764406564050374, + "grad_norm": 1.4078502655029297, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.867159366607666, + "num_tokens": 95460803.0, + "step": 2497 + }, 
+ { + "epoch": 0.31777127591909426, + "grad_norm": 1.4908500909805298, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8681383728981018, + "num_tokens": 95496599.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "grad_norm": 1.5955584049224854, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8703815937042236, + "num_tokens": 95529974.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "grad_norm": 1.6503218412399292, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8494753837585449, + "num_tokens": 95567893.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "grad_norm": 1.6762967109680176, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.846811056137085, + "num_tokens": 95602887.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "grad_norm": 1.527275800704956, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8519320487976074, + "num_tokens": 95645771.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "grad_norm": 1.4882980585098267, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8641828298568726, + "num_tokens": 95684035.0, + "step": 2503 + }, + { + "epoch": 0.3185345375906373, + "grad_norm": 1.4181939363479614, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8795286417007446, + "num_tokens": 95722755.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "grad_norm": 1.6259901523590088, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8340928554534912, + "num_tokens": 95763675.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "grad_norm": 1.4674246311187744, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.858472466468811, + "num_tokens": 95805596.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "grad_norm": 1.5799365043640137, + "learning_rate": 1e-06, + "loss": 0.4661, + 
"mean_token_accuracy": 0.8451550006866455, + "num_tokens": 95845210.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "grad_norm": 1.588395118713379, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8613671064376831, + "num_tokens": 95886332.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "grad_norm": 1.5053842067718506, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8400290012359619, + "num_tokens": 95925621.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "grad_norm": 1.552264928817749, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8502326011657715, + "num_tokens": 95964900.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "grad_norm": 1.4760932922363281, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8572328686714172, + "num_tokens": 96007582.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "grad_norm": 1.5079940557479858, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8673152923583984, + "num_tokens": 96044233.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "grad_norm": 1.5235289335250854, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8730827569961548, + "num_tokens": 96079406.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "grad_norm": 1.5613493919372559, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8575440049171448, + "num_tokens": 96116030.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "grad_norm": 1.413534164428711, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8641796112060547, + "num_tokens": 96157106.0, + "step": 2515 + }, + { + "epoch": 0.32006106093372344, + "grad_norm": 1.569697618484497, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8547549247741699, + "num_tokens": 96192317.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + 
"grad_norm": 1.4585729837417603, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8479857444763184, + "num_tokens": 96234846.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "grad_norm": 1.6659929752349854, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8660738468170166, + "num_tokens": 96269444.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "grad_norm": 1.7081389427185059, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.843387246131897, + "num_tokens": 96304690.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "grad_norm": 1.575685977935791, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8567962646484375, + "num_tokens": 96343285.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "grad_norm": 1.6216917037963867, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8566849231719971, + "num_tokens": 96379300.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "grad_norm": 1.516117811203003, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8610821962356567, + "num_tokens": 96417524.0, + "step": 2522 + }, + { + "epoch": 0.320951532883857, + "grad_norm": 1.4105660915374756, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8763303756713867, + "num_tokens": 96459385.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "grad_norm": 1.3672293424606323, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8746803402900696, + "num_tokens": 96501776.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "grad_norm": 1.558129072189331, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8715278506278992, + "num_tokens": 96538903.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "grad_norm": 1.595767617225647, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8476285934448242, + "num_tokens": 
96575575.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "grad_norm": 1.404115915298462, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8713603019714355, + "num_tokens": 96619904.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "grad_norm": 1.7312732934951782, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8590412139892578, + "num_tokens": 96652614.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "grad_norm": 1.4265820980072021, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8659003973007202, + "num_tokens": 96690743.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "grad_norm": 1.4897280931472778, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8426169753074646, + "num_tokens": 96730631.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "grad_norm": 1.5112125873565674, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8654294013977051, + "num_tokens": 96765440.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "grad_norm": 1.5390492677688599, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8456677198410034, + "num_tokens": 96807510.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "grad_norm": 1.5763847827911377, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8464258909225464, + "num_tokens": 96845661.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "grad_norm": 1.5279825925827026, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8624451160430908, + "num_tokens": 96880776.0, + "step": 2534 + }, + { + "epoch": 0.32247805622694314, + "grad_norm": 1.5267575979232788, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8611892461776733, + "num_tokens": 96917632.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "grad_norm": 1.4339888095855713, + "learning_rate": 1e-06, + 
"loss": 0.5111, + "mean_token_accuracy": 0.832021951675415, + "num_tokens": 96964028.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "grad_norm": 1.588261604309082, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8501042723655701, + "num_tokens": 97001634.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "grad_norm": 1.5971243381500244, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8478051424026489, + "num_tokens": 97037083.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "grad_norm": 1.5180683135986328, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8550546765327454, + "num_tokens": 97076052.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "grad_norm": 1.5133534669876099, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8683216571807861, + "num_tokens": 97111989.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "grad_norm": 1.4966613054275513, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8541393280029297, + "num_tokens": 97153748.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "grad_norm": 1.436049461364746, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8771806955337524, + "num_tokens": 97194263.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "grad_norm": 1.459043264389038, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8708534240722656, + "num_tokens": 97234517.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "grad_norm": 1.475420355796814, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8628950119018555, + "num_tokens": 97274177.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "grad_norm": 1.4014064073562622, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8522461652755737, + "num_tokens": 97319139.0, + "step": 2545 + }, + { + "epoch": 
0.32387736929143873, + "grad_norm": 1.6509474515914917, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8515313863754272, + "num_tokens": 97352621.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "grad_norm": 1.4746242761611938, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8574904203414917, + "num_tokens": 97395353.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "grad_norm": 1.549045205116272, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8471014499664307, + "num_tokens": 97437064.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "grad_norm": 1.5265636444091797, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8520632982254028, + "num_tokens": 97478152.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "grad_norm": 1.5030183792114258, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8458691239356995, + "num_tokens": 97520595.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "grad_norm": 1.3815271854400635, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.860281229019165, + "num_tokens": 97564904.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "grad_norm": 1.493666172027588, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8586881160736084, + "num_tokens": 97604777.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "grad_norm": 1.610288143157959, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8567759394645691, + "num_tokens": 97643675.0, + "step": 2553 + }, + { + "epoch": 0.32489505152016285, + "grad_norm": 1.5717705488204956, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8692964911460876, + "num_tokens": 97679330.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "grad_norm": 1.5472941398620605, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 
0.8634533286094666, + "num_tokens": 97715937.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "grad_norm": 1.6703457832336426, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8347613215446472, + "num_tokens": 97752624.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "grad_norm": 1.451162338256836, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.850655198097229, + "num_tokens": 97794914.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "grad_norm": 1.4687049388885498, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8601713180541992, + "num_tokens": 97838546.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "grad_norm": 1.557618498802185, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8502611517906189, + "num_tokens": 97879251.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "grad_norm": 1.564562439918518, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8496440052986145, + "num_tokens": 97914659.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "grad_norm": 1.5545103549957275, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8627272844314575, + "num_tokens": 97954500.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "grad_norm": 1.6033616065979004, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8614067435264587, + "num_tokens": 97989321.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "grad_norm": 1.5245766639709473, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8736827373504639, + "num_tokens": 98023238.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "grad_norm": 1.6187885999679565, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8411462306976318, + "num_tokens": 98061576.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "grad_norm": 1.5955051183700562, 
+ "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8718388676643372, + "num_tokens": 98094672.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "grad_norm": 1.5652562379837036, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8554145097732544, + "num_tokens": 98134945.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "grad_norm": 1.4925018548965454, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8564201593399048, + "num_tokens": 98172916.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "grad_norm": 1.4776266813278198, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8795291185379028, + "num_tokens": 98210835.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "grad_norm": 1.6537472009658813, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8589358925819397, + "num_tokens": 98244127.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "grad_norm": 1.4769731760025024, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8572202920913696, + "num_tokens": 98286249.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "grad_norm": 1.5805543661117554, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8589426279067993, + "num_tokens": 98320395.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "grad_norm": 1.5757780075073242, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8620494604110718, + "num_tokens": 98358920.0, + "step": 2572 + }, + { + "epoch": 0.3273120468133825, + "grad_norm": 1.6185846328735352, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8547905683517456, + "num_tokens": 98392616.0, + "step": 2573 + }, + { + "epoch": 0.327439257091973, + "grad_norm": 1.5080989599227905, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8576560020446777, + "num_tokens": 98429547.0, + "step": 2574 + 
}, + { + "epoch": 0.32756646737056355, + "grad_norm": 1.5019912719726562, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8398441076278687, + "num_tokens": 98470210.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "grad_norm": 1.49161958694458, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8643588423728943, + "num_tokens": 98507375.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "grad_norm": 1.5634585618972778, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8746625185012817, + "num_tokens": 98544177.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "grad_norm": 1.517533779144287, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8573614358901978, + "num_tokens": 98584096.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "grad_norm": 1.660321593284607, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8409273624420166, + "num_tokens": 98618507.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "grad_norm": 1.6378912925720215, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8478853702545166, + "num_tokens": 98654752.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "grad_norm": 1.5896977186203003, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.842287540435791, + "num_tokens": 98691761.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "grad_norm": 1.5378577709197998, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8464255332946777, + "num_tokens": 98732660.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "grad_norm": 1.5276073217391968, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8620200157165527, + "num_tokens": 98770609.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "grad_norm": 1.4439210891723633, + "learning_rate": 1e-06, + "loss": 0.4162, + 
"mean_token_accuracy": 0.8579850196838379, + "num_tokens": 98810235.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "grad_norm": 1.478737473487854, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.860382080078125, + "num_tokens": 98848989.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "grad_norm": 1.5363261699676514, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8434770107269287, + "num_tokens": 98888467.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "grad_norm": 1.6629632711410522, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8509995937347412, + "num_tokens": 98924607.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "grad_norm": 1.5502564907073975, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8524439930915833, + "num_tokens": 98963455.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "grad_norm": 1.7468836307525635, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8516153693199158, + "num_tokens": 98994900.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "grad_norm": 1.4717005491256714, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8746253252029419, + "num_tokens": 99032391.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "grad_norm": 1.7326679229736328, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.855212926864624, + "num_tokens": 99067149.0, + "step": 2591 + }, + { + "epoch": 0.3297290421066022, + "grad_norm": 1.52373206615448, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8644198775291443, + "num_tokens": 99105513.0, + "step": 2592 + }, + { + "epoch": 0.3298562523851927, + "grad_norm": 1.6196651458740234, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8617171049118042, + "num_tokens": 99136925.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + 
"grad_norm": 1.5815740823745728, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8530794382095337, + "num_tokens": 99175145.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "grad_norm": 1.6342315673828125, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8588927388191223, + "num_tokens": 99211098.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "grad_norm": 1.561156153678894, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8608133792877197, + "num_tokens": 99247649.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "grad_norm": 1.681357741355896, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8639368414878845, + "num_tokens": 99279423.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "grad_norm": 1.6073832511901855, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8556974530220032, + "num_tokens": 99317262.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "grad_norm": 1.4846038818359375, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8562086820602417, + "num_tokens": 99358055.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "grad_norm": 1.509278655052185, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8635387420654297, + "num_tokens": 99395998.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "grad_norm": 1.4674562215805054, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8718715906143188, + "num_tokens": 99434237.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "grad_norm": 1.4780651330947876, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8569045066833496, + "num_tokens": 99471203.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "grad_norm": 1.5503098964691162, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8461512327194214, + 
"num_tokens": 99507275.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "grad_norm": 1.6833083629608154, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8618104457855225, + "num_tokens": 99538228.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "grad_norm": 1.5794183015823364, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8643014430999756, + "num_tokens": 99575086.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "grad_norm": 1.4876289367675781, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8725196123123169, + "num_tokens": 99616217.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "grad_norm": 1.5739213228225708, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.851775050163269, + "num_tokens": 99651550.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "grad_norm": 1.552560567855835, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8533021807670593, + "num_tokens": 99688903.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "grad_norm": 1.56732177734375, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8384775519371033, + "num_tokens": 99728120.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "grad_norm": 1.4492154121398926, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8636353015899658, + "num_tokens": 99766561.0, + "step": 2610 + }, + { + "epoch": 0.3321460373998219, + "grad_norm": 1.545648217201233, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8474962711334229, + "num_tokens": 99805342.0, + "step": 2611 + }, + { + "epoch": 0.33227324767841243, + "grad_norm": 1.559521198272705, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8698058128356934, + "num_tokens": 99846589.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "grad_norm": 1.5456156730651855, + "learning_rate": 
1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8677129149436951, + "num_tokens": 99883251.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "grad_norm": 1.6093425750732422, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8421344757080078, + "num_tokens": 99921471.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "grad_norm": 1.7721760272979736, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8620514869689941, + "num_tokens": 99951065.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "grad_norm": 1.5809671878814697, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.877429723739624, + "num_tokens": 99987392.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "grad_norm": 1.5386947393417358, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8592324256896973, + "num_tokens": 100029920.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "grad_norm": 1.5193307399749756, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8696526288986206, + "num_tokens": 100067858.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "grad_norm": 1.4790717363357544, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8655968904495239, + "num_tokens": 100109579.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "grad_norm": 1.648404836654663, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8596874475479126, + "num_tokens": 100146948.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "grad_norm": 1.5507978200912476, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8680103421211243, + "num_tokens": 100180556.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "grad_norm": 1.5927296876907349, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8690742254257202, + "num_tokens": 100213094.0, + "step": 2622 + }, + { + 
"epoch": 0.333672560742908, + "grad_norm": 1.4455106258392334, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8698905110359192, + "num_tokens": 100251133.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "grad_norm": 1.6454689502716064, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8584340810775757, + "num_tokens": 100284800.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "grad_norm": 1.5525511503219604, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.856167197227478, + "num_tokens": 100320477.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "grad_norm": 1.631937861442566, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8434239625930786, + "num_tokens": 100356422.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "grad_norm": 1.6478112936019897, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8388460874557495, + "num_tokens": 100391131.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "grad_norm": 1.4771480560302734, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8564381003379822, + "num_tokens": 100429525.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "grad_norm": 1.5324949026107788, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8391919732093811, + "num_tokens": 100470035.0, + "step": 2629 + }, + { + "epoch": 0.3345630326930416, + "grad_norm": 1.631418228149414, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8577640056610107, + "num_tokens": 100506096.0, + "step": 2630 + }, + { + "epoch": 0.33469024297163213, + "grad_norm": 1.480171799659729, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8665894865989685, + "num_tokens": 100548661.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "grad_norm": 1.5717226266860962, + "learning_rate": 1e-06, + "loss": 0.4748, + 
"mean_token_accuracy": 0.8416099548339844, + "num_tokens": 100586456.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "grad_norm": 1.439692497253418, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8717151880264282, + "num_tokens": 100623743.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "grad_norm": 1.6006382703781128, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8553130626678467, + "num_tokens": 100661097.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "grad_norm": 1.495714783668518, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8695176839828491, + "num_tokens": 100698956.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "grad_norm": 1.4520354270935059, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8700791597366333, + "num_tokens": 100736831.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "grad_norm": 1.5342016220092773, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8489263653755188, + "num_tokens": 100773884.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "grad_norm": 1.5328667163848877, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.864017128944397, + "num_tokens": 100814181.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "grad_norm": 1.618813157081604, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8518778085708618, + "num_tokens": 100853983.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "grad_norm": 1.4741045236587524, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8687933683395386, + "num_tokens": 100890179.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "grad_norm": 1.70415198802948, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8538674712181091, + "num_tokens": 100924924.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + 
"grad_norm": 1.625009536743164, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.85200434923172, + "num_tokens": 100962534.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "grad_norm": 1.4517786502838135, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8607743978500366, + "num_tokens": 101002488.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "grad_norm": 1.3430089950561523, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8607586622238159, + "num_tokens": 101048781.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "grad_norm": 1.5197343826293945, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.868394136428833, + "num_tokens": 101087836.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "grad_norm": 1.5594992637634277, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8674217462539673, + "num_tokens": 101122625.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "grad_norm": 1.618709683418274, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8651447296142578, + "num_tokens": 101153167.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "grad_norm": 1.4131876230239868, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8684428334236145, + "num_tokens": 101189568.0, + "step": 2648 + }, + { + "epoch": 0.3369800279862613, + "grad_norm": 1.5042771100997925, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8590847849845886, + "num_tokens": 101228636.0, + "step": 2649 + }, + { + "epoch": 0.3371072382648518, + "grad_norm": 1.4555472135543823, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8672361373901367, + "num_tokens": 101269547.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "grad_norm": 1.408937931060791, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.865557074546814, + 
"num_tokens": 101310685.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "grad_norm": 1.5997761487960815, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8272451162338257, + "num_tokens": 101351026.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "grad_norm": 1.6008716821670532, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8550145626068115, + "num_tokens": 101388344.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "grad_norm": 1.5789494514465332, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8631887435913086, + "num_tokens": 101425494.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "grad_norm": 1.4417176246643066, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8658593893051147, + "num_tokens": 101466328.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "grad_norm": 1.488113522529602, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8635010719299316, + "num_tokens": 101508209.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "grad_norm": 1.7274532318115234, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8447140455245972, + "num_tokens": 101541472.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "grad_norm": 1.4249705076217651, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8438361287117004, + "num_tokens": 101587582.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "grad_norm": 1.671066403388977, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.841160237789154, + "num_tokens": 101624173.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "grad_norm": 1.7121134996414185, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8295713067054749, + "num_tokens": 101658997.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "grad_norm": 1.768477201461792, + 
"learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8599694967269897, + "num_tokens": 101690880.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "grad_norm": 1.382507085800171, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8710783123970032, + "num_tokens": 101732789.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "grad_norm": 1.5039494037628174, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8528035283088684, + "num_tokens": 101775000.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "grad_norm": 1.50544273853302, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8499026298522949, + "num_tokens": 101813386.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "grad_norm": 1.5956114530563354, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8519957065582275, + "num_tokens": 101846390.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "grad_norm": 1.516423225402832, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8569005727767944, + "num_tokens": 101885307.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "grad_norm": 1.6398649215698242, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8437683582305908, + "num_tokens": 101923319.0, + "step": 2667 + }, + { + "epoch": 0.33939702327948096, + "grad_norm": 1.5657376050949097, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.849760115146637, + "num_tokens": 101959063.0, + "step": 2668 + }, + { + "epoch": 0.3395242335580715, + "grad_norm": 1.4706238508224487, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8807834982872009, + "num_tokens": 101996900.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "grad_norm": 1.774020791053772, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8448388576507568, + "num_tokens": 102034863.0, + "step": 
2670 + }, + { + "epoch": 0.3397786541152525, + "grad_norm": 1.4228302240371704, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8528493642807007, + "num_tokens": 102075527.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "grad_norm": 1.6410478353500366, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8398967981338501, + "num_tokens": 102114926.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "grad_norm": 1.4315496683120728, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8534256219863892, + "num_tokens": 102158362.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "grad_norm": 1.4537954330444336, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8606008291244507, + "num_tokens": 102201105.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "grad_norm": 1.5468021631240845, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8779236078262329, + "num_tokens": 102233312.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "grad_norm": 1.7537262439727783, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8322478532791138, + "num_tokens": 102267342.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "grad_norm": 1.4088642597198486, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8752264976501465, + "num_tokens": 102304590.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "grad_norm": 1.4548251628875732, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8672736883163452, + "num_tokens": 102340882.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "grad_norm": 1.5129204988479614, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8666436672210693, + "num_tokens": 102376445.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "grad_norm": 1.5314440727233887, + "learning_rate": 1e-06, + "loss": 0.4125, + 
"mean_token_accuracy": 0.8602021336555481, + "num_tokens": 102413706.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "grad_norm": 1.5773379802703857, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.84986811876297, + "num_tokens": 102452843.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "grad_norm": 1.6351187229156494, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8413875102996826, + "num_tokens": 102489094.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "grad_norm": 1.6799724102020264, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8479911684989929, + "num_tokens": 102525873.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "grad_norm": 1.6874171495437622, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8545973896980286, + "num_tokens": 102560390.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "grad_norm": 1.4090468883514404, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8611326217651367, + "num_tokens": 102604455.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "grad_norm": 1.5888936519622803, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8684829473495483, + "num_tokens": 102641112.0, + "step": 2686 + }, + { + "epoch": 0.34181401857270066, + "grad_norm": 1.5494712591171265, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8567476272583008, + "num_tokens": 102679628.0, + "step": 2687 + }, + { + "epoch": 0.3419412288512912, + "grad_norm": 1.6101317405700684, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8618497848510742, + "num_tokens": 102714385.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "grad_norm": 1.5948132276535034, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.872061014175415, + "num_tokens": 102751095.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + 
"grad_norm": 1.6646673679351807, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.86390221118927, + "num_tokens": 102784904.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "grad_norm": 1.3439990282058716, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8588160872459412, + "num_tokens": 102835232.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "grad_norm": 1.5477499961853027, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8581730127334595, + "num_tokens": 102872663.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "grad_norm": 1.683152198791504, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8531473278999329, + "num_tokens": 102908056.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "grad_norm": 1.7232682704925537, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8711839318275452, + "num_tokens": 102944745.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "grad_norm": 1.5798590183258057, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8684980869293213, + "num_tokens": 102985200.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "grad_norm": 1.3931723833084106, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8707784414291382, + "num_tokens": 103029738.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "grad_norm": 1.6701574325561523, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8415365219116211, + "num_tokens": 103065830.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "grad_norm": 1.4745137691497803, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8568508625030518, + "num_tokens": 103111778.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "grad_norm": 1.442419409751892, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8640551567077637, + 
"num_tokens": 103155477.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "grad_norm": 1.5240912437438965, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8689740896224976, + "num_tokens": 103194670.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "grad_norm": 1.6117303371429443, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8684661984443665, + "num_tokens": 103230914.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "grad_norm": 1.5193395614624023, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8599668145179749, + "num_tokens": 103267826.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "grad_norm": 1.6093367338180542, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8562633991241455, + "num_tokens": 103301979.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "grad_norm": 1.4778194427490234, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8370057940483093, + "num_tokens": 103344950.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "grad_norm": 1.6248854398727417, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8600242733955383, + "num_tokens": 103382356.0, + "step": 2705 + }, + { + "epoch": 0.34423101386592037, + "grad_norm": 1.5247761011123657, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8675116300582886, + "num_tokens": 103421393.0, + "step": 2706 + }, + { + "epoch": 0.3443582241445109, + "grad_norm": 1.4872889518737793, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8645222783088684, + "num_tokens": 103460109.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "grad_norm": 1.5059895515441895, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8665080070495605, + "num_tokens": 103496715.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "grad_norm": 1.4566222429275513, + 
"learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.848792314529419, + "num_tokens": 103539804.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "grad_norm": 1.4833154678344727, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8525900840759277, + "num_tokens": 103583945.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "grad_norm": 1.6172869205474854, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8656213879585266, + "num_tokens": 103622391.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "grad_norm": 1.665074110031128, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8401177525520325, + "num_tokens": 103658889.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "grad_norm": 1.4108505249023438, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8731644749641418, + "num_tokens": 103697035.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "grad_norm": 1.5932389497756958, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8523216247558594, + "num_tokens": 103732261.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "grad_norm": 1.5188089609146118, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8626871109008789, + "num_tokens": 103771266.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "grad_norm": 1.5177572965621948, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8599673509597778, + "num_tokens": 103810735.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "grad_norm": 1.4990260601043701, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8491774201393127, + "num_tokens": 103850194.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "grad_norm": 1.6097486019134521, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8675665855407715, + "num_tokens": 103885770.0, + "step": 
2718 + }, + { + "epoch": 0.345884747487597, + "grad_norm": 1.475191593170166, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8537440896034241, + "num_tokens": 103927360.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "grad_norm": 1.4240368604660034, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8628677725791931, + "num_tokens": 103967618.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "grad_norm": 1.480794072151184, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8423702120780945, + "num_tokens": 104012242.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "grad_norm": 1.803827166557312, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8442928194999695, + "num_tokens": 104043232.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "grad_norm": 1.4441113471984863, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8609672784805298, + "num_tokens": 104084037.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "grad_norm": 1.4634368419647217, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8640603423118591, + "num_tokens": 104126213.0, + "step": 2724 + }, + { + "epoch": 0.34664800915914007, + "grad_norm": 1.4889895915985107, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8551410436630249, + "num_tokens": 104169084.0, + "step": 2725 + }, + { + "epoch": 0.34677521943773054, + "grad_norm": 1.50250244140625, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8608263731002808, + "num_tokens": 104206212.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "grad_norm": 1.5738320350646973, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8657442331314087, + "num_tokens": 104246180.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "grad_norm": 1.5514955520629883, + "learning_rate": 1e-06, + "loss": 0.4148, + 
"mean_token_accuracy": 0.8589321374893188, + "num_tokens": 104285627.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "grad_norm": 1.5425282716751099, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8725231885910034, + "num_tokens": 104319531.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "grad_norm": 1.464840054512024, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.862740159034729, + "num_tokens": 104357636.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "grad_norm": 1.5146313905715942, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8624053001403809, + "num_tokens": 104397106.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "grad_norm": 1.5381914377212524, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8341774940490723, + "num_tokens": 104436758.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "grad_norm": 1.5308934450149536, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8647520542144775, + "num_tokens": 104473475.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "grad_norm": 1.8244705200195312, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8463749289512634, + "num_tokens": 104509068.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "grad_norm": 1.6802401542663574, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8418846130371094, + "num_tokens": 104545766.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "grad_norm": 1.650266408920288, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8527619242668152, + "num_tokens": 104586403.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "grad_norm": 1.4663653373718262, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8438621759414673, + "num_tokens": 104629630.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + 
"grad_norm": 1.4651943445205688, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8575891852378845, + "num_tokens": 104671113.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "grad_norm": 1.6183247566223145, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8543124198913574, + "num_tokens": 104706287.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "grad_norm": 1.4283822774887085, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8552835583686829, + "num_tokens": 104748980.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "grad_norm": 1.490204095840454, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8674049377441406, + "num_tokens": 104786718.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "grad_norm": 1.7515877485275269, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8435299396514893, + "num_tokens": 104818867.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "grad_norm": 1.7551743984222412, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8475995063781738, + "num_tokens": 104850782.0, + "step": 2743 + }, + { + "epoch": 0.3490650044523598, + "grad_norm": 1.6874035596847534, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8383904695510864, + "num_tokens": 104887394.0, + "step": 2744 + }, + { + "epoch": 0.34919221473095025, + "grad_norm": 1.4833219051361084, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8643940687179565, + "num_tokens": 104923651.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "grad_norm": 1.5902870893478394, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8490222096443176, + "num_tokens": 104960860.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "grad_norm": 1.6320436000823975, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8604981899261475, + 
"num_tokens": 104996666.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "grad_norm": 1.3662720918655396, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8688659071922302, + "num_tokens": 105039061.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "grad_norm": 1.595638394355774, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8441574573516846, + "num_tokens": 105077125.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "grad_norm": 1.5728144645690918, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8565931916236877, + "num_tokens": 105111236.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "grad_norm": 1.5862152576446533, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8556917309761047, + "num_tokens": 105147763.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "grad_norm": 1.5280712842941284, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8549861311912537, + "num_tokens": 105188699.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "grad_norm": 1.3766226768493652, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8746268153190613, + "num_tokens": 105229847.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "grad_norm": 1.466742992401123, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8577157258987427, + "num_tokens": 105269286.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "grad_norm": 1.5366551876068115, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8618311882019043, + "num_tokens": 105308414.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "grad_norm": 1.5165926218032837, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8536356687545776, + "num_tokens": 105347296.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "grad_norm": 1.3834223747253418, + 
"learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8714359998703003, + "num_tokens": 105388701.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "grad_norm": 1.625290870666504, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.854122519493103, + "num_tokens": 105421865.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "grad_norm": 1.5644060373306274, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8570241928100586, + "num_tokens": 105457653.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "grad_norm": 1.5027450323104858, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8576166033744812, + "num_tokens": 105497925.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "grad_norm": 1.6277934312820435, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8644481897354126, + "num_tokens": 105533685.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "grad_norm": 1.7308597564697266, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8506201505661011, + "num_tokens": 105565036.0, + "step": 2762 + }, + { + "epoch": 0.3514819997455794, + "grad_norm": 1.8917416334152222, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8498895764350891, + "num_tokens": 105609815.0, + "step": 2763 + }, + { + "epoch": 0.35160921002416995, + "grad_norm": 1.701931118965149, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8413480520248413, + "num_tokens": 105643748.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "grad_norm": 1.5379074811935425, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8702202439308167, + "num_tokens": 105684607.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "grad_norm": 1.5051177740097046, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8453248739242554, + "num_tokens": 105723834.0, + "step": 
2766 + }, + { + "epoch": 0.3519908408599415, + "grad_norm": 1.4464433193206787, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8575270771980286, + "num_tokens": 105765592.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "grad_norm": 1.4096448421478271, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.863275945186615, + "num_tokens": 105808292.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "grad_norm": 1.5204389095306396, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8636393547058105, + "num_tokens": 105847842.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "grad_norm": 1.4997565746307373, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.863986611366272, + "num_tokens": 105890302.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "grad_norm": 1.602881908416748, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8543729186058044, + "num_tokens": 105924581.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "grad_norm": 1.462809681892395, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8768303990364075, + "num_tokens": 105962598.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "grad_norm": 1.421877384185791, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8771557807922363, + "num_tokens": 106002473.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "grad_norm": 1.4288647174835205, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8719753623008728, + "num_tokens": 106039400.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "grad_norm": 1.6368966102600098, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8728485703468323, + "num_tokens": 106074550.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "grad_norm": 1.4800310134887695, + "learning_rate": 1e-06, + "loss": 0.3763, + 
"mean_token_accuracy": 0.8691253662109375, + "num_tokens": 106114028.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "grad_norm": 1.4649051427841187, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8688227534294128, + "num_tokens": 106153182.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "grad_norm": 1.4695594310760498, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8823354840278625, + "num_tokens": 106190542.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "grad_norm": 1.4871478080749512, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8580418825149536, + "num_tokens": 106230218.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "grad_norm": 1.5751855373382568, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8578331470489502, + "num_tokens": 106268898.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "grad_norm": 1.6226812601089478, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8394352197647095, + "num_tokens": 106309125.0, + "step": 2781 + }, + { + "epoch": 0.3538989950387991, + "grad_norm": 1.6343237161636353, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8491724729537964, + "num_tokens": 106343873.0, + "step": 2782 + }, + { + "epoch": 0.35402620531738965, + "grad_norm": 1.6590543985366821, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8485234975814819, + "num_tokens": 106378255.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "grad_norm": 1.5727781057357788, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8654181957244873, + "num_tokens": 106415583.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "grad_norm": 1.5861736536026, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8600791692733765, + "num_tokens": 106452533.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + 
"grad_norm": 1.5376776456832886, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8614461421966553, + "num_tokens": 106489628.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "grad_norm": 1.457439661026001, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8645269870758057, + "num_tokens": 106532257.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "grad_norm": 1.6259602308273315, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8560861349105835, + "num_tokens": 106569445.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "grad_norm": 1.4799764156341553, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8577903509140015, + "num_tokens": 106609980.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "grad_norm": 1.4262293577194214, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8535681366920471, + "num_tokens": 106655390.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "grad_norm": 1.5689644813537598, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8551876544952393, + "num_tokens": 106692711.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "grad_norm": 1.513137698173523, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8646117448806763, + "num_tokens": 106733118.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "grad_norm": 1.5029504299163818, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8774387240409851, + "num_tokens": 106769806.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "grad_norm": 1.460610270500183, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8430514335632324, + "num_tokens": 106815314.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "grad_norm": 1.5109962224960327, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.858549952507019, + 
"num_tokens": 106853363.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "grad_norm": 1.5944432020187378, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8780324459075928, + "num_tokens": 106888676.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "grad_norm": 1.4805560111999512, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8557640314102173, + "num_tokens": 106931761.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "grad_norm": 1.4431064128875732, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.851078450679779, + "num_tokens": 106973178.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "grad_norm": 1.5777300596237183, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8568872809410095, + "num_tokens": 107010158.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "grad_norm": 1.594843864440918, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8534039258956909, + "num_tokens": 107048337.0, + "step": 2800 + }, + { + "epoch": 0.35631599033201883, + "grad_norm": 1.4544073343276978, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8522529006004333, + "num_tokens": 107088004.0, + "step": 2801 + }, + { + "epoch": 0.35644320061060936, + "grad_norm": 1.5343999862670898, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8767592310905457, + "num_tokens": 107124993.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "grad_norm": 1.5713177919387817, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8483255505561829, + "num_tokens": 107168447.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "grad_norm": 1.48140287399292, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8601368069648743, + "num_tokens": 107212124.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "grad_norm": 1.5976060628890991, + 
"learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8525513410568237, + "num_tokens": 107247945.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "grad_norm": 1.6943916082382202, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8445494174957275, + "num_tokens": 107280945.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "grad_norm": 1.471248984336853, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8605870008468628, + "num_tokens": 107320318.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "grad_norm": 1.4392931461334229, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8618583679199219, + "num_tokens": 107361961.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "grad_norm": 1.506331443786621, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8704935312271118, + "num_tokens": 107396584.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "grad_norm": 1.578392744064331, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8652706146240234, + "num_tokens": 107432418.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "grad_norm": 1.4695682525634766, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8612899780273438, + "num_tokens": 107473717.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "grad_norm": 1.5680373907089233, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8527999520301819, + "num_tokens": 107513187.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "grad_norm": 1.6113721132278442, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8295434713363647, + "num_tokens": 107554290.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "grad_norm": 1.602033019065857, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8553536534309387, + "num_tokens": 107594771.0, + "step": 2814 
+ }, + { + "epoch": 0.35809693423228595, + "grad_norm": 1.5188515186309814, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8708001971244812, + "num_tokens": 107630818.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "grad_norm": 1.6218963861465454, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8659170866012573, + "num_tokens": 107666318.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "grad_norm": 1.6136629581451416, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8731558918952942, + "num_tokens": 107701996.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "grad_norm": 1.6567126512527466, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8545185327529907, + "num_tokens": 107733551.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "grad_norm": 1.593337893486023, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.855629026889801, + "num_tokens": 107770653.0, + "step": 2819 + }, + { + "epoch": 0.35873298562523853, + "grad_norm": 1.4634616374969482, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8492120504379272, + "num_tokens": 107816470.0, + "step": 2820 + }, + { + "epoch": 0.358860195903829, + "grad_norm": 1.572690725326538, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8677793741226196, + "num_tokens": 107849856.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "grad_norm": 1.5344072580337524, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8555965423583984, + "num_tokens": 107886898.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "grad_norm": 1.7094224691390991, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.842994213104248, + "num_tokens": 107920257.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "grad_norm": 1.5306936502456665, + "learning_rate": 1e-06, + "loss": 0.4275, + 
"mean_token_accuracy": 0.8571996688842773, + "num_tokens": 107955085.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "grad_norm": 1.4560449123382568, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8626922965049744, + "num_tokens": 107993576.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "grad_norm": 1.5785562992095947, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.852558434009552, + "num_tokens": 108031661.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "grad_norm": 1.5041714906692505, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8665122985839844, + "num_tokens": 108067701.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "grad_norm": 1.6054480075836182, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8423113822937012, + "num_tokens": 108106880.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "grad_norm": 1.3239866495132446, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8685832619667053, + "num_tokens": 108150777.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "grad_norm": 1.4180346727371216, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8618918657302856, + "num_tokens": 108192240.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "grad_norm": 1.4911112785339355, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8679395318031311, + "num_tokens": 108229716.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "grad_norm": 1.5105971097946167, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.854468822479248, + "num_tokens": 108269599.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "grad_norm": 1.6831177473068237, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8650250434875488, + "num_tokens": 108303799.0, + "step": 2833 + }, + { + "epoch": 
0.36051392952550565, + "grad_norm": 1.8916676044464111, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8604517579078674, + "num_tokens": 108341003.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "grad_norm": 1.4349365234375, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8670312166213989, + "num_tokens": 108385567.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "grad_norm": 1.475944995880127, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8732984066009521, + "num_tokens": 108424847.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "grad_norm": 1.6747957468032837, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8604239821434021, + "num_tokens": 108456489.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "grad_norm": 1.4997655153274536, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8722836375236511, + "num_tokens": 108492777.0, + "step": 2838 + }, + { + "epoch": 0.36114998091845824, + "grad_norm": 1.596252679824829, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8446354866027832, + "num_tokens": 108529570.0, + "step": 2839 + }, + { + "epoch": 0.3612771911970487, + "grad_norm": 1.549483299255371, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8646947145462036, + "num_tokens": 108565236.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "grad_norm": 1.4264676570892334, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8648357391357422, + "num_tokens": 108608319.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "grad_norm": 1.5374467372894287, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.858356773853302, + "num_tokens": 108650087.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "grad_norm": 1.5445818901062012, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 
0.8463526964187622, + "num_tokens": 108688438.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "grad_norm": 1.456545114517212, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8513575792312622, + "num_tokens": 108728771.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "grad_norm": 1.6540729999542236, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.847227931022644, + "num_tokens": 108765038.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "grad_norm": 1.638134241104126, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8637464046478271, + "num_tokens": 108804247.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "grad_norm": 1.6832255125045776, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8521892428398132, + "num_tokens": 108839684.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "grad_norm": 1.4781877994537354, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8679096698760986, + "num_tokens": 108880264.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "grad_norm": 1.623181700706482, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8622928857803345, + "num_tokens": 108918034.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "grad_norm": 1.5635350942611694, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8618971705436707, + "num_tokens": 108954616.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "grad_norm": 1.5558711290359497, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8589432239532471, + "num_tokens": 108993060.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "grad_norm": 1.4347624778747559, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8724935054779053, + "num_tokens": 109033050.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "grad_norm": 
1.468799352645874, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8560065031051636, + "num_tokens": 109075261.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "grad_norm": 1.4476392269134521, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8738925457000732, + "num_tokens": 109117287.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "grad_norm": 1.5072176456451416, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8534568548202515, + "num_tokens": 109159274.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "grad_norm": 1.4581727981567383, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8638215065002441, + "num_tokens": 109200959.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "grad_norm": 1.397308349609375, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8700780868530273, + "num_tokens": 109247891.0, + "step": 2857 + }, + { + "epoch": 0.3635669762116779, + "grad_norm": 1.5773950815200806, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.845810055732727, + "num_tokens": 109283209.0, + "step": 2858 + }, + { + "epoch": 0.3636941864902684, + "grad_norm": 1.5569406747817993, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8361040949821472, + "num_tokens": 109321495.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "grad_norm": 1.6883082389831543, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8592589497566223, + "num_tokens": 109359125.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "grad_norm": 1.4884586334228516, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8683565855026245, + "num_tokens": 109395471.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "grad_norm": 1.5792824029922485, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8478306531906128, + "num_tokens": 
109435031.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "grad_norm": 1.4920936822891235, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8669400215148926, + "num_tokens": 109472633.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "grad_norm": 1.6178538799285889, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8508675694465637, + "num_tokens": 109508069.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "grad_norm": 1.397236943244934, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8703221678733826, + "num_tokens": 109547495.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "grad_norm": 1.5681300163269043, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.849778413772583, + "num_tokens": 109585442.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "grad_norm": 1.6361141204833984, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8775780200958252, + "num_tokens": 109616008.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "grad_norm": 1.540432333946228, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8604170083999634, + "num_tokens": 109659592.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "grad_norm": 1.5063502788543701, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8616824150085449, + "num_tokens": 109698467.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "grad_norm": 1.7341407537460327, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.864133358001709, + "num_tokens": 109727439.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "grad_norm": 1.5728248357772827, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8523756265640259, + "num_tokens": 109766157.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "grad_norm": 1.4739006757736206, + "learning_rate": 
1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8665831089019775, + "num_tokens": 109804397.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "grad_norm": 1.5965263843536377, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8600122928619385, + "num_tokens": 109841216.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "grad_norm": 1.556425929069519, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8570051789283752, + "num_tokens": 109878306.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "grad_norm": 1.5337268114089966, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8385193347930908, + "num_tokens": 109923359.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "grad_norm": 1.4905582666397095, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8583651781082153, + "num_tokens": 109961762.0, + "step": 2876 + }, + { + "epoch": 0.3659839715048976, + "grad_norm": 1.541914463043213, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8536025285720825, + "num_tokens": 109997890.0, + "step": 2877 + }, + { + "epoch": 0.3661111817834881, + "grad_norm": 1.5371747016906738, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8611832857131958, + "num_tokens": 110036344.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "grad_norm": 1.4504474401474, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8657329678535461, + "num_tokens": 110075590.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "grad_norm": 1.594002604484558, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8698982000350952, + "num_tokens": 110109146.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "grad_norm": 1.5219759941101074, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8579619526863098, + "num_tokens": 110148590.0, + "step": 2881 + }, + { + 
"epoch": 0.3666200228978502, + "grad_norm": 1.4195982217788696, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8638087511062622, + "num_tokens": 110192904.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "grad_norm": 1.5667675733566284, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8629372715950012, + "num_tokens": 110226189.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "grad_norm": 1.3634319305419922, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8762037754058838, + "num_tokens": 110268297.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "grad_norm": 1.605872392654419, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8589128255844116, + "num_tokens": 110305345.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "grad_norm": 1.562717080116272, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8373502492904663, + "num_tokens": 110346149.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "grad_norm": 1.5254460573196411, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8643333911895752, + "num_tokens": 110386654.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "grad_norm": 1.5047798156738281, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8564003705978394, + "num_tokens": 110426521.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "grad_norm": 1.500434160232544, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8656577467918396, + "num_tokens": 110466147.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "grad_norm": 1.596276879310608, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8593445420265198, + "num_tokens": 110508557.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "grad_norm": 1.408549427986145, + "learning_rate": 1e-06, + "loss": 0.4113, + 
"mean_token_accuracy": 0.8616358637809753, + "num_tokens": 110553067.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "grad_norm": 1.4966331720352173, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8376883268356323, + "num_tokens": 110599503.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "grad_norm": 1.5342484712600708, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8434894680976868, + "num_tokens": 110637099.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "grad_norm": 1.7375304698944092, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8518827557563782, + "num_tokens": 110667054.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "grad_norm": 1.51554536819458, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8684052228927612, + "num_tokens": 110703755.0, + "step": 2895 + }, + { + "epoch": 0.3684009667981173, + "grad_norm": 1.4955836534500122, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8605550527572632, + "num_tokens": 110741417.0, + "step": 2896 + }, + { + "epoch": 0.3685281770767078, + "grad_norm": 1.58711576461792, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8525415062904358, + "num_tokens": 110776269.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "grad_norm": 1.4687600135803223, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8418207168579102, + "num_tokens": 110819740.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "grad_norm": 1.5793343782424927, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8567327260971069, + "num_tokens": 110856989.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "grad_norm": 1.5211570262908936, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8689431548118591, + "num_tokens": 110894934.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, 
+ "grad_norm": 1.4836920499801636, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8681512475013733, + "num_tokens": 110933728.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "grad_norm": 1.431142807006836, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8546525239944458, + "num_tokens": 110972435.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "grad_norm": 1.4195550680160522, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8651057481765747, + "num_tokens": 111012241.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "grad_norm": 1.4599789381027222, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8606600165367126, + "num_tokens": 111052140.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "grad_norm": 1.6662250757217407, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.865243673324585, + "num_tokens": 111083654.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "grad_norm": 1.5879058837890625, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8447067737579346, + "num_tokens": 111122573.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "grad_norm": 1.5757969617843628, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8742200136184692, + "num_tokens": 111157230.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "grad_norm": 1.6003400087356567, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8536111116409302, + "num_tokens": 111192806.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "grad_norm": 1.4769951105117798, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8641382455825806, + "num_tokens": 111231074.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "grad_norm": 1.6690820455551147, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8494147062301636, + 
"num_tokens": 111267329.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "grad_norm": 1.5710166692733765, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.860869288444519, + "num_tokens": 111307285.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "grad_norm": 1.642922043800354, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8410326242446899, + "num_tokens": 111344141.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "grad_norm": 1.46224844455719, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8575690984725952, + "num_tokens": 111384965.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "grad_norm": 1.6689084768295288, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.862207293510437, + "num_tokens": 111418620.0, + "step": 2914 + }, + { + "epoch": 0.370817962091337, + "grad_norm": 1.4850845336914062, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8770244121551514, + "num_tokens": 111455131.0, + "step": 2915 + }, + { + "epoch": 0.37094517236992747, + "grad_norm": 1.469036340713501, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8626341223716736, + "num_tokens": 111495832.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "grad_norm": 1.4104007482528687, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8485035300254822, + "num_tokens": 111540416.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "grad_norm": 1.5674095153808594, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8585799932479858, + "num_tokens": 111575403.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "grad_norm": 1.5294171571731567, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8610022068023682, + "num_tokens": 111613327.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "grad_norm": 1.53976309299469, + 
"learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8591189384460449, + "num_tokens": 111652136.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "grad_norm": 1.5524792671203613, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8725808262825012, + "num_tokens": 111686115.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "grad_norm": 1.507941484451294, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8671882152557373, + "num_tokens": 111724982.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "grad_norm": 1.5102189779281616, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.858814537525177, + "num_tokens": 111762551.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "grad_norm": 1.5177258253097534, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8627611398696899, + "num_tokens": 111804041.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "grad_norm": 1.5869877338409424, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.859345555305481, + "num_tokens": 111843379.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "grad_norm": 1.5995489358901978, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8743146657943726, + "num_tokens": 111878430.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "grad_norm": 1.4417393207550049, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8619397282600403, + "num_tokens": 111919274.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "grad_norm": 1.6602730751037598, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8636413812637329, + "num_tokens": 111954865.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "grad_norm": 1.5778604745864868, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8665204048156738, + "num_tokens": 111988802.0, + "step": 
2929 + }, + { + "epoch": 0.37272611627019464, + "grad_norm": 1.5033613443374634, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8593748807907104, + "num_tokens": 112028268.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "grad_norm": 1.4896645545959473, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8608423471450806, + "num_tokens": 112065951.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "grad_norm": 1.5332002639770508, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8542545437812805, + "num_tokens": 112103390.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "grad_norm": 1.5407090187072754, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8722080588340759, + "num_tokens": 112139518.0, + "step": 2933 + }, + { + "epoch": 0.3732349573845567, + "grad_norm": 1.515974760055542, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8754812479019165, + "num_tokens": 112173799.0, + "step": 2934 + }, + { + "epoch": 0.37336216766314717, + "grad_norm": 1.4379262924194336, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8674325942993164, + "num_tokens": 112214464.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "grad_norm": 1.433596134185791, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8630475997924805, + "num_tokens": 112259543.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "grad_norm": 1.464179277420044, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8648282289505005, + "num_tokens": 112301321.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "grad_norm": 1.5512672662734985, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8457521796226501, + "num_tokens": 112340928.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "grad_norm": 1.59808349609375, + "learning_rate": 1e-06, + "loss": 0.3921, + 
"mean_token_accuracy": 0.8633414506912231, + "num_tokens": 112375643.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "grad_norm": 1.6216399669647217, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.854783296585083, + "num_tokens": 112413155.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "grad_norm": 1.5437148809432983, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8598601818084717, + "num_tokens": 112452962.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "grad_norm": 1.5888752937316895, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.858733594417572, + "num_tokens": 112490938.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "grad_norm": 1.6153963804244995, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8388804793357849, + "num_tokens": 112529045.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "grad_norm": 1.550009846687317, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8609231114387512, + "num_tokens": 112569318.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "grad_norm": 1.5832687616348267, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8679026365280151, + "num_tokens": 112603009.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "grad_norm": 1.5691605806350708, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8571271896362305, + "num_tokens": 112639464.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "grad_norm": 1.4534143209457397, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8696815371513367, + "num_tokens": 112680877.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "grad_norm": 1.6399400234222412, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8668039441108704, + "num_tokens": 112712503.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, 
+ "grad_norm": 1.5907235145568848, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8640369176864624, + "num_tokens": 112746302.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "grad_norm": 1.4578471183776855, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8589352369308472, + "num_tokens": 112790083.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "grad_norm": 1.4613685607910156, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8744572401046753, + "num_tokens": 112826388.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "grad_norm": 1.4568828344345093, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8549219369888306, + "num_tokens": 112867245.0, + "step": 2952 + }, + { + "epoch": 0.37565195267777635, + "grad_norm": 1.477142095565796, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8574765920639038, + "num_tokens": 112913164.0, + "step": 2953 + }, + { + "epoch": 0.3757791629563669, + "grad_norm": 1.43072509765625, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8826651573181152, + "num_tokens": 112950572.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "grad_norm": 1.6273142099380493, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8632246255874634, + "num_tokens": 112984883.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "grad_norm": 1.4812421798706055, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8658110499382019, + "num_tokens": 113022995.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "grad_norm": 2.126871109008789, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8605155944824219, + "num_tokens": 113063840.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "grad_norm": 1.6903443336486816, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8475804924964905, + 
"num_tokens": 113102358.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "grad_norm": 1.6881141662597656, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8463011384010315, + "num_tokens": 113142117.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "grad_norm": 1.6477171182632446, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8512742519378662, + "num_tokens": 113181143.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "grad_norm": 1.6344074010849, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.856025755405426, + "num_tokens": 113215344.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "grad_norm": 1.543521523475647, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8536449670791626, + "num_tokens": 113261471.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "grad_norm": 1.5648832321166992, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8630124926567078, + "num_tokens": 113300517.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "grad_norm": 1.563531756401062, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8540511727333069, + "num_tokens": 113340915.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "grad_norm": 1.4391859769821167, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8593719601631165, + "num_tokens": 113384308.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "grad_norm": 1.719619631767273, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8416155576705933, + "num_tokens": 113421059.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "grad_norm": 1.7600460052490234, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.865307629108429, + "num_tokens": 113458024.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "grad_norm": 1.5501940250396729, + 
"learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8797170519828796, + "num_tokens": 113495761.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "grad_norm": 1.4318513870239258, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8580988645553589, + "num_tokens": 113538372.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "grad_norm": 1.6200920343399048, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8687769174575806, + "num_tokens": 113572100.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "grad_norm": 1.6515270471572876, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8463633060455322, + "num_tokens": 113606244.0, + "step": 2971 + }, + { + "epoch": 0.37806894797099605, + "grad_norm": 1.515579342842102, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8716352581977844, + "num_tokens": 113642608.0, + "step": 2972 + }, + { + "epoch": 0.3781961582495866, + "grad_norm": 1.5889713764190674, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.855972409248352, + "num_tokens": 113678479.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "grad_norm": 1.7549517154693604, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8343617916107178, + "num_tokens": 113711836.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "grad_norm": 1.6864800453186035, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8435705900192261, + "num_tokens": 113748233.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "grad_norm": 1.5802619457244873, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8529605269432068, + "num_tokens": 113787735.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "grad_norm": 1.4670490026474, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8660333156585693, + "num_tokens": 113827518.0, + "step": 2977 
+ }, + { + "epoch": 0.3788322096425391, + "grad_norm": 1.4718149900436401, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8677897453308105, + "num_tokens": 113869379.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "grad_norm": 1.58015775680542, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8591673970222473, + "num_tokens": 113904586.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "grad_norm": 1.4179134368896484, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.880293071269989, + "num_tokens": 113944512.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "grad_norm": 1.6373026371002197, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8649412393569946, + "num_tokens": 113979674.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "grad_norm": 1.6878620386123657, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8702612519264221, + "num_tokens": 114011936.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "grad_norm": 1.6275427341461182, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8553038835525513, + "num_tokens": 114045553.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "grad_norm": 1.6544383764266968, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8504194021224976, + "num_tokens": 114077529.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "grad_norm": 1.4376459121704102, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8679229617118835, + "num_tokens": 114116190.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "grad_norm": 1.5223939418792725, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8559751510620117, + "num_tokens": 114154835.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "grad_norm": 1.5810223817825317, + "learning_rate": 1e-06, + "loss": 0.4689, + 
"mean_token_accuracy": 0.8481429219245911, + "num_tokens": 114194161.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "grad_norm": 1.8370791673660278, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8481941223144531, + "num_tokens": 114222639.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "grad_norm": 1.3985412120819092, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.865286648273468, + "num_tokens": 114268010.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "grad_norm": 1.444414734840393, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.849557638168335, + "num_tokens": 114308326.0, + "step": 2990 + }, + { + "epoch": 0.38048594326421575, + "grad_norm": 1.5472846031188965, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8636316657066345, + "num_tokens": 114343653.0, + "step": 2991 + }, + { + "epoch": 0.3806131535428063, + "grad_norm": 1.6035749912261963, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8567608594894409, + "num_tokens": 114378907.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "grad_norm": 1.4554699659347534, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8577226996421814, + "num_tokens": 114415185.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "grad_norm": 1.6984764337539673, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8323498964309692, + "num_tokens": 114452469.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "grad_norm": 1.4261881113052368, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8610358238220215, + "num_tokens": 114493710.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "grad_norm": 1.454654574394226, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8619300127029419, + "num_tokens": 114535619.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + 
"grad_norm": 1.4131977558135986, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8657384514808655, + "num_tokens": 114578863.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "grad_norm": 1.4081562757492065, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8704873919487, + "num_tokens": 114617800.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "grad_norm": 1.5657745599746704, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8428516387939453, + "num_tokens": 114659106.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "grad_norm": 1.5401585102081299, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8229647874832153, + "num_tokens": 114700159.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "grad_norm": 1.645377516746521, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8685532808303833, + "num_tokens": 114732181.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "grad_norm": 1.5844534635543823, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8697880506515503, + "num_tokens": 114764967.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "grad_norm": 1.6374105215072632, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8608062863349915, + "num_tokens": 114805730.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "grad_norm": 1.598702311515808, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8658738136291504, + "num_tokens": 114845554.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "grad_norm": 1.5794241428375244, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8614898324012756, + "num_tokens": 114880824.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "grad_norm": 1.6566303968429565, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.863865315914154, + 
"num_tokens": 114913054.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "grad_norm": 1.4226495027542114, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8645151853561401, + "num_tokens": 114954629.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "grad_norm": 1.605038046836853, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8470301628112793, + "num_tokens": 114991012.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "grad_norm": 1.461559772491455, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8660351634025574, + "num_tokens": 115030145.0, + "step": 3009 + }, + { + "epoch": 0.38290293855743546, + "grad_norm": 1.5082159042358398, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8585144281387329, + "num_tokens": 115071862.0, + "step": 3010 + }, + { + "epoch": 0.38303014883602593, + "grad_norm": 1.3532724380493164, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8856362700462341, + "num_tokens": 115110722.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "grad_norm": 1.447126865386963, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8494637608528137, + "num_tokens": 115156607.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "grad_norm": 1.5825814008712769, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8576575517654419, + "num_tokens": 115191190.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "grad_norm": 1.4400932788848877, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8554846048355103, + "num_tokens": 115238043.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "grad_norm": 1.4547895193099976, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8712359666824341, + "num_tokens": 115277815.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "grad_norm": 1.4511815309524536, + 
"learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8617851734161377, + "num_tokens": 115317506.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "grad_norm": 1.8331027030944824, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8580970764160156, + "num_tokens": 115347468.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "grad_norm": 1.500631332397461, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8687038421630859, + "num_tokens": 115383281.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "grad_norm": 1.3873775005340576, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.863808810710907, + "num_tokens": 115426374.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "grad_norm": 1.3535963296890259, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8713947534561157, + "num_tokens": 115469091.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "grad_norm": 1.4997159242630005, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8722271919250488, + "num_tokens": 115506823.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "grad_norm": 1.4709608554840088, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8731822967529297, + "num_tokens": 115542364.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "grad_norm": 1.4339110851287842, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8582629561424255, + "num_tokens": 115580880.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "grad_norm": 1.4296376705169678, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8744234442710876, + "num_tokens": 115619347.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "grad_norm": 1.5353809595108032, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8609377145767212, + "num_tokens": 115659968.0, + "step": 
3025 + }, + { + "epoch": 0.3849383030148836, + "grad_norm": 1.4607938528060913, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8585917353630066, + "num_tokens": 115698713.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "grad_norm": 1.5260586738586426, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8637287020683289, + "num_tokens": 115738322.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "grad_norm": 1.5342578887939453, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.847698450088501, + "num_tokens": 115777119.0, + "step": 3028 + }, + { + "epoch": 0.3853199338506551, + "grad_norm": 1.5428361892700195, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8751341104507446, + "num_tokens": 115812648.0, + "step": 3029 + }, + { + "epoch": 0.38544714412924563, + "grad_norm": 1.5881065130233765, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8320709466934204, + "num_tokens": 115850687.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "grad_norm": 1.4680267572402954, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8629154562950134, + "num_tokens": 115892078.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "grad_norm": 1.4225118160247803, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8766994476318359, + "num_tokens": 115931216.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "grad_norm": 1.446399211883545, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8551011681556702, + "num_tokens": 115974078.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "grad_norm": 1.5264846086502075, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8637951612472534, + "num_tokens": 116010530.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "grad_norm": 1.6690864562988281, + "learning_rate": 1e-06, + "loss": 0.464, + 
"mean_token_accuracy": 0.8423186540603638, + "num_tokens": 116048887.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "grad_norm": 1.4256846904754639, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8706098794937134, + "num_tokens": 116090757.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "grad_norm": 1.5323340892791748, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.87059086561203, + "num_tokens": 116128090.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "grad_norm": 1.4535316228866577, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8493421673774719, + "num_tokens": 116169362.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "grad_norm": 1.6787998676300049, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8517622351646423, + "num_tokens": 116200776.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "grad_norm": 1.6144639253616333, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8683712482452393, + "num_tokens": 116236273.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "grad_norm": 1.629353642463684, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8534141778945923, + "num_tokens": 116268942.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "grad_norm": 1.727786660194397, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8481466770172119, + "num_tokens": 116304904.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "grad_norm": 1.5822482109069824, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8648136854171753, + "num_tokens": 116341723.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "grad_norm": 1.5818363428115845, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8566056489944458, + "num_tokens": 116377371.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + 
"grad_norm": 1.5379525423049927, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8541609048843384, + "num_tokens": 116418901.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "grad_norm": 1.5802842378616333, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8589504361152649, + "num_tokens": 116456292.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "grad_norm": 1.592484712600708, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.856760561466217, + "num_tokens": 116492683.0, + "step": 3047 + }, + { + "epoch": 0.3877369291438748, + "grad_norm": 1.5672816038131714, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8604863882064819, + "num_tokens": 116526672.0, + "step": 3048 + }, + { + "epoch": 0.38786413942246534, + "grad_norm": 1.6636157035827637, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8421940803527832, + "num_tokens": 116559357.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "grad_norm": 1.5615640878677368, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8462042808532715, + "num_tokens": 116598605.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "grad_norm": 1.7255674600601196, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8481609225273132, + "num_tokens": 116634218.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "grad_norm": 1.3964325189590454, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8605560660362244, + "num_tokens": 116675591.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "grad_norm": 1.5199350118637085, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8598889112472534, + "num_tokens": 116711390.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "grad_norm": 1.6645772457122803, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8680994510650635, + 
"num_tokens": 116744271.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "grad_norm": 1.522955298423767, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8609887957572937, + "num_tokens": 116782318.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "grad_norm": 1.5590240955352783, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8400532603263855, + "num_tokens": 116823960.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "grad_norm": 1.56792414188385, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8615315556526184, + "num_tokens": 116858676.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "grad_norm": 1.544198989868164, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8622866272926331, + "num_tokens": 116893197.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "grad_norm": 1.2947498559951782, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8735906481742859, + "num_tokens": 116935849.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "grad_norm": 1.5202689170837402, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8584024906158447, + "num_tokens": 116974097.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "grad_norm": 1.5715206861495972, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8555487394332886, + "num_tokens": 117010674.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "grad_norm": 1.4831202030181885, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8709661364555359, + "num_tokens": 117045695.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "grad_norm": 1.506377935409546, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8687688112258911, + "num_tokens": 117081893.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "grad_norm": 1.4565932750701904, + "learning_rate": 
1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8640177845954895, + "num_tokens": 117122593.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "grad_norm": 1.557041883468628, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8706387281417847, + "num_tokens": 117156211.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "grad_norm": 1.4834567308425903, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8606406450271606, + "num_tokens": 117198110.0, + "step": 3066 + }, + { + "epoch": 0.3901539244370945, + "grad_norm": 1.8784714937210083, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8732891082763672, + "num_tokens": 117234728.0, + "step": 3067 + }, + { + "epoch": 0.39028113471568504, + "grad_norm": 1.5267109870910645, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8593482375144958, + "num_tokens": 117273883.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "grad_norm": 1.6805261373519897, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8432028293609619, + "num_tokens": 117308738.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "grad_norm": 1.5562814474105835, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.866195797920227, + "num_tokens": 117345058.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "grad_norm": 1.5352174043655396, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8503223657608032, + "num_tokens": 117383980.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "grad_norm": 1.5406688451766968, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8595582842826843, + "num_tokens": 117422006.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "grad_norm": 1.491467833518982, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8570680618286133, + "num_tokens": 117460072.0, + "step": 3073 + }, + { + 
"epoch": 0.3910443963872281, + "grad_norm": 1.5723903179168701, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.861905038356781, + "num_tokens": 117495638.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "grad_norm": 1.6303802728652954, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.83427494764328, + "num_tokens": 117531493.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "grad_norm": 1.520293116569519, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8525331020355225, + "num_tokens": 117572790.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "grad_norm": 1.487011194229126, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8600481748580933, + "num_tokens": 117613136.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "grad_norm": 1.4671684503555298, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8612993955612183, + "num_tokens": 117655490.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "grad_norm": 1.6569592952728271, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8479816913604736, + "num_tokens": 117690183.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "grad_norm": 1.4941294193267822, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8602075576782227, + "num_tokens": 117725440.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "grad_norm": 1.5119514465332031, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8544323444366455, + "num_tokens": 117763849.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "grad_norm": 1.6262959241867065, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8410353660583496, + "num_tokens": 117804695.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "grad_norm": 1.5443347692489624, + "learning_rate": 1e-06, + "loss": 0.4618, + 
"mean_token_accuracy": 0.847608208656311, + "num_tokens": 117845866.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "grad_norm": 1.5108753442764282, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8498917818069458, + "num_tokens": 117885528.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "grad_norm": 1.503368854522705, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8492262363433838, + "num_tokens": 117924377.0, + "step": 3085 + }, + { + "epoch": 0.3925709197303142, + "grad_norm": 1.6161104440689087, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.865088701248169, + "num_tokens": 117959378.0, + "step": 3086 + }, + { + "epoch": 0.39269813000890474, + "grad_norm": 1.4253196716308594, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8525828123092651, + "num_tokens": 118003331.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "grad_norm": 1.4923888444900513, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8565952777862549, + "num_tokens": 118044534.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "grad_norm": 1.5953752994537354, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8604127764701843, + "num_tokens": 118080513.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "grad_norm": 1.4676947593688965, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8533418774604797, + "num_tokens": 118123042.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "grad_norm": 1.4910606145858765, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.861738920211792, + "num_tokens": 118162262.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "grad_norm": 1.3313699960708618, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8581901788711548, + "num_tokens": 118208661.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + 
"grad_norm": 1.5236170291900635, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8509721159934998, + "num_tokens": 118250858.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "grad_norm": 1.4516621828079224, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8816835880279541, + "num_tokens": 118285163.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "grad_norm": 1.4718598127365112, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.862026572227478, + "num_tokens": 118323568.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "grad_norm": 1.5441638231277466, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.868381142616272, + "num_tokens": 118358835.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "grad_norm": 1.4318300485610962, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8645090460777283, + "num_tokens": 118397872.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "grad_norm": 1.4726686477661133, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8677663207054138, + "num_tokens": 118437805.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "grad_norm": 1.540950894355774, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8613144755363464, + "num_tokens": 118478478.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "grad_norm": 1.502678632736206, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.856308102607727, + "num_tokens": 118517688.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "grad_norm": 1.453051209449768, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8492029309272766, + "num_tokens": 118560055.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "grad_norm": 1.400601863861084, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.860804557800293, + 
"num_tokens": 118604703.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "grad_norm": 1.741558313369751, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8542321920394897, + "num_tokens": 118638347.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "grad_norm": 1.5798122882843018, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8576951026916504, + "num_tokens": 118676290.0, + "step": 3104 + }, + { + "epoch": 0.3949879150235339, + "grad_norm": 1.5897715091705322, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8593230843544006, + "num_tokens": 118710251.0, + "step": 3105 + }, + { + "epoch": 0.3951151253021244, + "grad_norm": 1.852936863899231, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8573293685913086, + "num_tokens": 118739514.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "grad_norm": 1.5244417190551758, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8449458479881287, + "num_tokens": 118777532.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "grad_norm": 1.7566560506820679, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.864963173866272, + "num_tokens": 118814321.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "grad_norm": 1.571215271949768, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8537054061889648, + "num_tokens": 118853071.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "grad_norm": 1.6995775699615479, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8627912998199463, + "num_tokens": 118883466.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "grad_norm": 1.5591176748275757, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.839684009552002, + "num_tokens": 118921910.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "grad_norm": 1.5707319974899292, + 
"learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8757820129394531, + "num_tokens": 118957049.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "grad_norm": 1.5694372653961182, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8678545951843262, + "num_tokens": 118990305.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "grad_norm": 1.692700982093811, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8451618552207947, + "num_tokens": 119024196.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "grad_norm": 1.6220617294311523, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.851155698299408, + "num_tokens": 119055962.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "grad_norm": 1.5835850238800049, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8644033670425415, + "num_tokens": 119091484.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "grad_norm": 1.684187650680542, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8517932891845703, + "num_tokens": 119130178.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "grad_norm": 1.5706454515457153, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.851946234703064, + "num_tokens": 119166968.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "grad_norm": 1.611396312713623, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8526611328125, + "num_tokens": 119202654.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "grad_norm": 1.5221003293991089, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8591620922088623, + "num_tokens": 119241611.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "grad_norm": 1.4456110000610352, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8624845147132874, + "num_tokens": 119284635.0, + "step": 3121 + 
}, + { + "epoch": 0.39715048975957257, + "grad_norm": 1.6183366775512695, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8668452501296997, + "num_tokens": 119319746.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "grad_norm": 1.4602481126785278, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8760972023010254, + "num_tokens": 119357954.0, + "step": 3123 + }, + { + "epoch": 0.39740491031675357, + "grad_norm": 1.6324741840362549, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8684463500976562, + "num_tokens": 119388616.0, + "step": 3124 + }, + { + "epoch": 0.3975321205953441, + "grad_norm": 1.5247273445129395, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8714491128921509, + "num_tokens": 119426009.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "grad_norm": 1.5964080095291138, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.843375027179718, + "num_tokens": 119464713.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "grad_norm": 1.5471479892730713, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.859375, + "num_tokens": 119500453.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "grad_norm": 1.3332222700119019, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8669552803039551, + "num_tokens": 119544067.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "grad_norm": 1.6403257846832275, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8546693325042725, + "num_tokens": 119576624.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "grad_norm": 1.6008113622665405, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.868695080280304, + "num_tokens": 119612441.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "grad_norm": 1.6553261280059814, + "learning_rate": 1e-06, + "loss": 0.4398, + 
"mean_token_accuracy": 0.8522658348083496, + "num_tokens": 119648210.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "grad_norm": 1.5260002613067627, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8555232286453247, + "num_tokens": 119689474.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "grad_norm": 1.4229720830917358, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8679993748664856, + "num_tokens": 119730536.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "grad_norm": 1.5509681701660156, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8668414354324341, + "num_tokens": 119764139.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "grad_norm": 1.5806500911712646, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8657684326171875, + "num_tokens": 119799215.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "grad_norm": 1.5132932662963867, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8634865880012512, + "num_tokens": 119838185.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "grad_norm": 1.5043989419937134, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8673559427261353, + "num_tokens": 119877397.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "grad_norm": 1.5624257326126099, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8629779815673828, + "num_tokens": 119914952.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "grad_norm": 1.761687994003296, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8561152219772339, + "num_tokens": 119946501.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "grad_norm": 1.6284180879592896, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8500046730041504, + "num_tokens": 119978008.0, + "step": 3140 + }, + { + "epoch": 
0.39956748505279227, + "grad_norm": 1.5526025295257568, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8637191653251648, + "num_tokens": 120014207.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "grad_norm": 1.4057109355926514, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8783903121948242, + "num_tokens": 120052631.0, + "step": 3142 + }, + { + "epoch": 0.39982190560997327, + "grad_norm": 1.4075061082839966, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8767614364624023, + "num_tokens": 120092485.0, + "step": 3143 + }, + { + "epoch": 0.3999491158885638, + "grad_norm": 1.6165940761566162, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8587309122085571, + "num_tokens": 120125550.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "grad_norm": 1.5482569932937622, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8395717740058899, + "num_tokens": 120165332.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "grad_norm": 1.5519740581512451, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8473860025405884, + "num_tokens": 120202833.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "grad_norm": 1.527958631515503, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8714306354522705, + "num_tokens": 120237420.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "grad_norm": 1.6049931049346924, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8574961423873901, + "num_tokens": 120270437.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "grad_norm": 1.4418619871139526, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8601231575012207, + "num_tokens": 120312233.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "grad_norm": 1.538590669631958, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 
0.8748722076416016, + "num_tokens": 120345436.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "grad_norm": 1.534812569618225, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8628885746002197, + "num_tokens": 120385773.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "grad_norm": 1.5467456579208374, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8678691387176514, + "num_tokens": 120421409.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "grad_norm": 1.5699244737625122, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.870607852935791, + "num_tokens": 120459703.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "grad_norm": 1.510469675064087, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.866953432559967, + "num_tokens": 120500908.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "grad_norm": 1.4150547981262207, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8759716749191284, + "num_tokens": 120541322.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "grad_norm": 1.593827486038208, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8656996488571167, + "num_tokens": 120575077.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "grad_norm": 1.4945533275604248, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8433241844177246, + "num_tokens": 120617291.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "grad_norm": 1.6194392442703247, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8712041974067688, + "num_tokens": 120652566.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "grad_norm": 1.5539768934249878, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8584749102592468, + "num_tokens": 120691209.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "grad_norm": 
1.4792349338531494, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8608883023262024, + "num_tokens": 120729837.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "grad_norm": 1.5057487487792969, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8717790842056274, + "num_tokens": 120767618.0, + "step": 3161 + }, + { + "epoch": 0.402238900903193, + "grad_norm": 1.4007983207702637, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8854336738586426, + "num_tokens": 120807253.0, + "step": 3162 + }, + { + "epoch": 0.4023661111817835, + "grad_norm": 1.59494149684906, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8655998706817627, + "num_tokens": 120842760.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "grad_norm": 1.6386984586715698, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8644993901252747, + "num_tokens": 120879857.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "grad_norm": 1.5234079360961914, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8581743240356445, + "num_tokens": 120920081.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "grad_norm": 1.4152169227600098, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8681297898292542, + "num_tokens": 120958795.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "grad_norm": 1.495376706123352, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8672318458557129, + "num_tokens": 120999547.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "grad_norm": 1.6942588090896606, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8664219379425049, + "num_tokens": 121030027.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "grad_norm": 1.519465684890747, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8432857990264893, + "num_tokens": 
121070725.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "grad_norm": 1.480075716972351, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8653906583786011, + "num_tokens": 121108529.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "grad_norm": 1.4512615203857422, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8698763847351074, + "num_tokens": 121147691.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "grad_norm": 1.563008427619934, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8659521341323853, + "num_tokens": 121182585.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "grad_norm": 1.5873138904571533, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8617837429046631, + "num_tokens": 121219394.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "grad_norm": 1.3777518272399902, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.873847484588623, + "num_tokens": 121263233.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "grad_norm": 1.5664836168289185, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.861249566078186, + "num_tokens": 121303909.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "grad_norm": 1.7241767644882202, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8559996485710144, + "num_tokens": 121332351.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "grad_norm": 1.472377061843872, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8663010001182556, + "num_tokens": 121375014.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "grad_norm": 1.5735690593719482, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8569805026054382, + "num_tokens": 121410964.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "grad_norm": 1.5752907991409302, + "learning_rate": 1e-06, + 
"loss": 0.4293, + "mean_token_accuracy": 0.853759765625, + "num_tokens": 121447927.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "grad_norm": 1.5947604179382324, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8692004680633545, + "num_tokens": 121482055.0, + "step": 3180 + }, + { + "epoch": 0.4046558961964127, + "grad_norm": 1.5789951086044312, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8494933843612671, + "num_tokens": 121521992.0, + "step": 3181 + }, + { + "epoch": 0.4047831064750032, + "grad_norm": 1.6553248167037964, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.854072630405426, + "num_tokens": 121556273.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "grad_norm": 1.6030622720718384, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8577613234519958, + "num_tokens": 121587728.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "grad_norm": 1.6065289974212646, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8379608392715454, + "num_tokens": 121624907.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "grad_norm": 1.5438501834869385, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8625268936157227, + "num_tokens": 121661609.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "grad_norm": 1.5692580938339233, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8595503568649292, + "num_tokens": 121696597.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "grad_norm": 1.4980131387710571, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8632604479789734, + "num_tokens": 121733552.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "grad_norm": 1.5268245935440063, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8711968660354614, + "num_tokens": 121773165.0, + "step": 3188 + }, + { + "epoch": 
0.40567357842513674, + "grad_norm": 1.526181936264038, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8566956520080566, + "num_tokens": 121818540.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "grad_norm": 1.4249950647354126, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8577405214309692, + "num_tokens": 121862667.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "grad_norm": 1.6010191440582275, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8441084027290344, + "num_tokens": 121900328.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "grad_norm": 1.7399766445159912, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8552237749099731, + "num_tokens": 121931005.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "grad_norm": 1.5322017669677734, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8592154383659363, + "num_tokens": 121971914.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "grad_norm": 1.4469330310821533, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8608145713806152, + "num_tokens": 122017057.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "grad_norm": 1.5436345338821411, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8770293593406677, + "num_tokens": 122055140.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "grad_norm": 1.4589300155639648, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8572552800178528, + "num_tokens": 122097706.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "grad_norm": 1.6355953216552734, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8703612089157104, + "num_tokens": 122128780.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "grad_norm": 1.5038671493530273, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 
0.8641029000282288, + "num_tokens": 122166106.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "grad_norm": 1.6855881214141846, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8571351766586304, + "num_tokens": 122205167.0, + "step": 3199 + }, + { + "epoch": 0.4070728914896324, + "grad_norm": 1.6381360292434692, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8457781076431274, + "num_tokens": 122244733.0, + "step": 3200 + }, + { + "epoch": 0.40720010176822286, + "grad_norm": 1.4751744270324707, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8545405864715576, + "num_tokens": 122285853.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "grad_norm": 1.5182719230651855, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8678956031799316, + "num_tokens": 122323422.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "grad_norm": 1.4953702688217163, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8631026744842529, + "num_tokens": 122358805.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "grad_norm": 1.6302733421325684, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8622170090675354, + "num_tokens": 122393768.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "grad_norm": 1.7504019737243652, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8671996593475342, + "num_tokens": 122423750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "grad_norm": 1.6466717720031738, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8458811044692993, + "num_tokens": 122461628.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "grad_norm": 1.6260324716567993, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8631210327148438, + "num_tokens": 122496921.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "grad_norm": 
1.585498571395874, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8648892641067505, + "num_tokens": 122528389.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "grad_norm": 1.5915923118591309, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8501971960067749, + "num_tokens": 122563460.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "grad_norm": 1.5438083410263062, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8541610240936279, + "num_tokens": 122601503.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "grad_norm": 1.5164719820022583, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8670071363449097, + "num_tokens": 122635464.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "grad_norm": 1.4444423913955688, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8706616163253784, + "num_tokens": 122676637.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "grad_norm": 1.4623427391052246, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8507533073425293, + "num_tokens": 122720546.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "grad_norm": 1.5399274826049805, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8645944595336914, + "num_tokens": 122758179.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "grad_norm": 1.5136207342147827, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8681313395500183, + "num_tokens": 122794918.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "grad_norm": 1.6093508005142212, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8500921726226807, + "num_tokens": 122830039.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "grad_norm": 1.5285886526107788, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8498748540878296, + 
"num_tokens": 122867358.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "grad_norm": 1.5357334613800049, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8615091443061829, + "num_tokens": 122904607.0, + "step": 3218 + }, + { + "epoch": 0.40948988678285203, + "grad_norm": 1.5193307399749756, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8525192141532898, + "num_tokens": 122943114.0, + "step": 3219 + }, + { + "epoch": 0.40961709706144256, + "grad_norm": 1.4772100448608398, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8688870668411255, + "num_tokens": 122981100.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "grad_norm": 1.3697854280471802, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8645389080047607, + "num_tokens": 123024450.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "grad_norm": 1.4273102283477783, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8649629354476929, + "num_tokens": 123064016.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "grad_norm": 1.6291680335998535, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8586550951004028, + "num_tokens": 123099606.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "grad_norm": 1.5988818407058716, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8525905013084412, + "num_tokens": 123133789.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "grad_norm": 1.57882559299469, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8639406561851501, + "num_tokens": 123171974.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "grad_norm": 1.4106916189193726, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8848139047622681, + "num_tokens": 123205872.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "grad_norm": 1.6665055751800537, + 
"learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8577481508255005, + "num_tokens": 123237713.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "grad_norm": 1.420185923576355, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8572297692298889, + "num_tokens": 123280825.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "grad_norm": 1.6692166328430176, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8523949384689331, + "num_tokens": 123315774.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "grad_norm": 1.5417264699935913, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.853693425655365, + "num_tokens": 123354914.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "grad_norm": 1.3492895364761353, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8711413145065308, + "num_tokens": 123398427.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "grad_norm": 1.5047800540924072, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.862908661365509, + "num_tokens": 123436238.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "grad_norm": 1.4412715435028076, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8621890544891357, + "num_tokens": 123474265.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "grad_norm": 1.492762565612793, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8709014058113098, + "num_tokens": 123508941.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "grad_norm": 1.6495361328125, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8483665585517883, + "num_tokens": 123548471.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "grad_norm": 1.6741329431533813, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8796657919883728, + "num_tokens": 123578313.0, + "step": 3236 + 
}, + { + "epoch": 0.41177967179748126, + "grad_norm": 1.6621426343917847, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8547359704971313, + "num_tokens": 123618041.0, + "step": 3237 + }, + { + "epoch": 0.41190688207607173, + "grad_norm": 1.6214510202407837, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8442641496658325, + "num_tokens": 123656828.0, + "step": 3238 + }, + { + "epoch": 0.41203409235466226, + "grad_norm": 1.5490189790725708, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8491729497909546, + "num_tokens": 123695441.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "grad_norm": 1.608850359916687, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8651607632637024, + "num_tokens": 123730787.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "grad_norm": 1.3075777292251587, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8655515909194946, + "num_tokens": 123778636.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "grad_norm": 1.4781516790390015, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.853041410446167, + "num_tokens": 123820704.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "grad_norm": 1.5283550024032593, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8442879915237427, + "num_tokens": 123859659.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "grad_norm": 1.5527979135513306, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8673173189163208, + "num_tokens": 123894538.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "grad_norm": 1.4405561685562134, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8736307621002197, + "num_tokens": 123932354.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "grad_norm": 1.5596375465393066, + "learning_rate": 1e-06, + "loss": 0.386, + 
"mean_token_accuracy": 0.8701732158660889, + "num_tokens": 123970371.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "grad_norm": 1.3659038543701172, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8730853796005249, + "num_tokens": 124014002.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "grad_norm": 1.5226271152496338, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8679612874984741, + "num_tokens": 124049643.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "grad_norm": 1.6004345417022705, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8601076602935791, + "num_tokens": 124084433.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "grad_norm": 1.594200611114502, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8616102337837219, + "num_tokens": 124126052.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "grad_norm": 1.5646092891693115, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8446069955825806, + "num_tokens": 124163826.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "grad_norm": 1.5102347135543823, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8610273599624634, + "num_tokens": 124206402.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "grad_norm": 1.4820383787155151, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8590338230133057, + "num_tokens": 124245069.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "grad_norm": 1.4137675762176514, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8505754470825195, + "num_tokens": 124291482.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "grad_norm": 1.4875813722610474, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8744537830352783, + "num_tokens": 124329548.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, 
+ "grad_norm": 1.414794683456421, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8663145303726196, + "num_tokens": 124373716.0, + "step": 3256 + }, + { + "epoch": 0.41432387736929144, + "grad_norm": 1.5226006507873535, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8594444394111633, + "num_tokens": 124413057.0, + "step": 3257 + }, + { + "epoch": 0.41445108764788197, + "grad_norm": 1.4314266443252563, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8541985154151917, + "num_tokens": 124454026.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "grad_norm": 1.5130336284637451, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8701885938644409, + "num_tokens": 124490372.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "grad_norm": 1.4907656908035278, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8720796704292297, + "num_tokens": 124529213.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "grad_norm": 1.4835082292556763, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.862478494644165, + "num_tokens": 124567115.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "grad_norm": 1.5148411989212036, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8645443916320801, + "num_tokens": 124604368.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "grad_norm": 1.4334286451339722, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.870745062828064, + "num_tokens": 124648183.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "grad_norm": 1.6018221378326416, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8526469469070435, + "num_tokens": 124683901.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "grad_norm": 1.6266560554504395, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8416595458984375, + 
"num_tokens": 124721612.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "grad_norm": 1.3935532569885254, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8663813471794128, + "num_tokens": 124765870.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "grad_norm": 1.5926457643508911, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8528832197189331, + "num_tokens": 124802449.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "grad_norm": 1.4200565814971924, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8627910614013672, + "num_tokens": 124846573.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "grad_norm": 1.6133002042770386, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8535184860229492, + "num_tokens": 124883095.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "grad_norm": 1.6531518697738647, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8616544604301453, + "num_tokens": 124916076.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "grad_norm": 1.5861358642578125, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8565011024475098, + "num_tokens": 124951611.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "grad_norm": 1.4683407545089722, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8731441497802734, + "num_tokens": 124991868.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "grad_norm": 1.5143566131591797, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.866215169429779, + "num_tokens": 125028777.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "grad_norm": 1.4746609926223755, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8604342937469482, + "num_tokens": 125068873.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "grad_norm": 1.6302345991134644, + 
"learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8527368307113647, + "num_tokens": 125106287.0, + "step": 3275 + }, + { + "epoch": 0.41674087266251114, + "grad_norm": 1.5569162368774414, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8657885193824768, + "num_tokens": 125144166.0, + "step": 3276 + }, + { + "epoch": 0.4168680829411016, + "grad_norm": 1.478002667427063, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8749619722366333, + "num_tokens": 125180832.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "grad_norm": 1.6807892322540283, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8523188233375549, + "num_tokens": 125215398.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "grad_norm": 1.4891108274459839, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8702378869056702, + "num_tokens": 125256841.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "grad_norm": 1.836621642112732, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8616759777069092, + "num_tokens": 125287626.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "grad_norm": 1.6195491552352905, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8679265975952148, + "num_tokens": 125323860.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "grad_norm": 1.5022590160369873, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8749370574951172, + "num_tokens": 125359796.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "grad_norm": 1.4716740846633911, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8585587739944458, + "num_tokens": 125401184.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "grad_norm": 1.4871187210083008, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8575611114501953, + "num_tokens": 125440267.0, + "step": 
3284 + }, + { + "epoch": 0.41788576516982573, + "grad_norm": 1.6366310119628906, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8542173504829407, + "num_tokens": 125476819.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "grad_norm": 1.495377779006958, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8556243777275085, + "num_tokens": 125516227.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "grad_norm": 1.5256690979003906, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8637173771858215, + "num_tokens": 125553846.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "grad_norm": 1.4252816438674927, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.860670268535614, + "num_tokens": 125595452.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "grad_norm": 1.6298961639404297, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8592978715896606, + "num_tokens": 125630630.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "grad_norm": 1.4934062957763672, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8609729409217834, + "num_tokens": 125669665.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "grad_norm": 1.5495173931121826, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8611346483230591, + "num_tokens": 125709016.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "grad_norm": 1.725250482559204, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.863327145576477, + "num_tokens": 125736995.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "grad_norm": 1.4835169315338135, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8714036345481873, + "num_tokens": 125772838.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "grad_norm": 1.5865211486816406, + "learning_rate": 1e-06, + "loss": 0.435, + 
"mean_token_accuracy": 0.8486696481704712, + "num_tokens": 125809523.0, + "step": 3294 + }, + { + "epoch": 0.41915786795573085, + "grad_norm": 1.5951131582260132, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8510242700576782, + "num_tokens": 125845604.0, + "step": 3295 + }, + { + "epoch": 0.4192850782343213, + "grad_norm": 1.5297260284423828, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8704007863998413, + "num_tokens": 125884613.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "grad_norm": 1.5552512407302856, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8741550445556641, + "num_tokens": 125921526.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "grad_norm": 1.5355201959609985, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8568727374076843, + "num_tokens": 125962711.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "grad_norm": 1.4211509227752686, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8718447089195251, + "num_tokens": 126001530.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "grad_norm": 1.6301883459091187, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8531965017318726, + "num_tokens": 126039202.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "grad_norm": 1.4823087453842163, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8677594661712646, + "num_tokens": 126079330.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "grad_norm": 1.573021650314331, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8552566170692444, + "num_tokens": 126119022.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "grad_norm": 1.6346144676208496, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8591147065162659, + "num_tokens": 126156053.0, + "step": 3303 + }, + { + "epoch": 
0.42030276046304543, + "grad_norm": 1.4519222974777222, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8621440529823303, + "num_tokens": 126195639.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "grad_norm": 1.4936199188232422, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8570823669433594, + "num_tokens": 126234632.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "grad_norm": 1.5928014516830444, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8514021039009094, + "num_tokens": 126272425.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "grad_norm": 1.6134611368179321, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8578341603279114, + "num_tokens": 126310748.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "grad_norm": 1.4533933401107788, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.855068027973175, + "num_tokens": 126353104.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "grad_norm": 1.463599443435669, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.851184606552124, + "num_tokens": 126399202.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "grad_norm": 1.5409126281738281, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8482276201248169, + "num_tokens": 126437669.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "grad_norm": 1.5455849170684814, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8808871507644653, + "num_tokens": 126473174.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "grad_norm": 1.624345064163208, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8531116843223572, + "num_tokens": 126510818.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "grad_norm": 1.4930522441864014, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 
0.8703876733779907, + "num_tokens": 126555063.0, + "step": 3313 + }, + { + "epoch": 0.4215748632489505, + "grad_norm": 1.689626932144165, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8691178560256958, + "num_tokens": 126592124.0, + "step": 3314 + }, + { + "epoch": 0.421702073527541, + "grad_norm": 1.4166154861450195, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8606851100921631, + "num_tokens": 126633588.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "grad_norm": 1.499884843826294, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8486235737800598, + "num_tokens": 126673684.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "grad_norm": 1.6324251890182495, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.862230658531189, + "num_tokens": 126707611.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "grad_norm": 1.501943588256836, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8615752458572388, + "num_tokens": 126747426.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "grad_norm": 1.5331225395202637, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8763653039932251, + "num_tokens": 126782799.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "grad_norm": 1.4000740051269531, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8753564357757568, + "num_tokens": 126822974.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "grad_norm": 1.5251903533935547, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8498155474662781, + "num_tokens": 126861461.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "grad_norm": 1.524922251701355, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8481197357177734, + "num_tokens": 126901899.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "grad_norm": 
1.544009804725647, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.862554669380188, + "num_tokens": 126942377.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "grad_norm": 1.5131144523620605, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8510708212852478, + "num_tokens": 126982919.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "grad_norm": 1.6755839586257935, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8767862915992737, + "num_tokens": 127014275.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "grad_norm": 1.5041248798370361, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8768755793571472, + "num_tokens": 127054332.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "grad_norm": 1.729725956916809, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8529322147369385, + "num_tokens": 127089223.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "grad_norm": 1.513535976409912, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.869908332824707, + "num_tokens": 127125361.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "grad_norm": 1.517685890197754, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8594639301300049, + "num_tokens": 127160345.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "grad_norm": 1.5888656377792358, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8596174716949463, + "num_tokens": 127195078.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "grad_norm": 1.6177470684051514, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8628380298614502, + "num_tokens": 127232381.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "grad_norm": 1.6458580493927002, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8672964572906494, + "num_tokens": 
127268731.0, + "step": 3332 + }, + { + "epoch": 0.4239918585421702, + "grad_norm": 1.6006288528442383, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.853163480758667, + "num_tokens": 127305455.0, + "step": 3333 + }, + { + "epoch": 0.4241190688207607, + "grad_norm": 1.6454730033874512, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8654537200927734, + "num_tokens": 127339924.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "grad_norm": 1.5444440841674805, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8578053712844849, + "num_tokens": 127375413.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "grad_norm": 1.5691322088241577, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8534492254257202, + "num_tokens": 127410872.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "grad_norm": 1.4815136194229126, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.84549880027771, + "num_tokens": 127451948.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "grad_norm": 4.304675102233887, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8558696508407593, + "num_tokens": 127490441.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "grad_norm": 1.7728207111358643, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8733206987380981, + "num_tokens": 127530809.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "grad_norm": 1.6580742597579956, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.844927966594696, + "num_tokens": 127566957.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "grad_norm": 1.5203627347946167, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8644431233406067, + "num_tokens": 127604675.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "grad_norm": 1.6149635314941406, + "learning_rate": 1e-06, + 
"loss": 0.3685, + "mean_token_accuracy": 0.8713523745536804, + "num_tokens": 127635201.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "grad_norm": 1.473873496055603, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8601859211921692, + "num_tokens": 127670321.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "grad_norm": 1.4336313009262085, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8625379204750061, + "num_tokens": 127712654.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "grad_norm": 1.5300565958023071, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8508864641189575, + "num_tokens": 127755051.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "grad_norm": 1.523586392402649, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8746976852416992, + "num_tokens": 127793949.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "grad_norm": 1.728718876838684, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8613071441650391, + "num_tokens": 127833632.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "grad_norm": 1.4413771629333496, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8640350103378296, + "num_tokens": 127871756.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "grad_norm": 1.520513892173767, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.86427903175354, + "num_tokens": 127908305.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "grad_norm": 1.7168889045715332, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.86869215965271, + "num_tokens": 127941904.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "grad_norm": 1.5287412405014038, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8556396961212158, + "num_tokens": 127984077.0, + "step": 3351 + }, + { + "epoch": 
0.4264088538353899, + "grad_norm": 1.49229097366333, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8774904608726501, + "num_tokens": 128023751.0, + "step": 3352 + }, + { + "epoch": 0.42653606411398043, + "grad_norm": 1.5817153453826904, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.862002968788147, + "num_tokens": 128059496.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "grad_norm": 1.484467625617981, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8689705729484558, + "num_tokens": 128098349.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "grad_norm": 1.4000529050827026, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8774207830429077, + "num_tokens": 128138003.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "grad_norm": 1.476251482963562, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.854610800743103, + "num_tokens": 128180833.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "grad_norm": 1.4493499994277954, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8587530851364136, + "num_tokens": 128221864.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "grad_norm": 1.383581519126892, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8729718923568726, + "num_tokens": 128264553.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "grad_norm": 1.5090806484222412, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8508449196815491, + "num_tokens": 128308039.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "grad_norm": 1.5321279764175415, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8606734275817871, + "num_tokens": 128344593.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "grad_norm": 1.5662477016448975, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 
0.8744785785675049, + "num_tokens": 128379509.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "grad_norm": 1.5674264430999756, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8674601316452026, + "num_tokens": 128414127.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "grad_norm": 1.436157464981079, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8838315010070801, + "num_tokens": 128453007.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "grad_norm": 1.4976977109909058, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.874030351638794, + "num_tokens": 128487872.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "grad_norm": 1.5746296644210815, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8634527325630188, + "num_tokens": 128525429.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "grad_norm": 1.52370285987854, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8499425649642944, + "num_tokens": 128563714.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "grad_norm": 1.5178481340408325, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8683973550796509, + "num_tokens": 128598260.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "grad_norm": 1.493910312652588, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8624526262283325, + "num_tokens": 128641233.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.5015320777893066, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8572664260864258, + "num_tokens": 128682458.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "grad_norm": 1.3933947086334229, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8586715459823608, + "num_tokens": 128726764.0, + "step": 3370 + }, + { + "epoch": 0.4288258491286096, + "grad_norm": 
1.5368613004684448, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8593432903289795, + "num_tokens": 128761048.0, + "step": 3371 + }, + { + "epoch": 0.4289530594072001, + "grad_norm": 1.5916677713394165, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8699876070022583, + "num_tokens": 128794729.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "grad_norm": 1.5895318984985352, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8563668727874756, + "num_tokens": 128832617.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "grad_norm": 1.4867908954620361, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8710657954216003, + "num_tokens": 128869752.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "grad_norm": 1.5199106931686401, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8797125816345215, + "num_tokens": 128907622.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "grad_norm": 1.4949331283569336, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8559398055076599, + "num_tokens": 128944638.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "grad_norm": 1.4651310443878174, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8603134751319885, + "num_tokens": 128982797.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "grad_norm": 1.7024542093276978, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8377710580825806, + "num_tokens": 129017684.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "grad_norm": 1.5297918319702148, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8659940958023071, + "num_tokens": 129054424.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "grad_norm": 1.44756019115448, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8649238348007202, + "num_tokens": 
129094874.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "grad_norm": 1.5320284366607666, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8640698790550232, + "num_tokens": 129136178.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "grad_norm": 1.5373260974884033, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8632417917251587, + "num_tokens": 129174678.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "grad_norm": 1.4255719184875488, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8635692596435547, + "num_tokens": 129215103.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "grad_norm": 1.63954758644104, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8587042689323425, + "num_tokens": 129249261.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "grad_norm": 1.4042433500289917, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8657959699630737, + "num_tokens": 129291218.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "grad_norm": 1.4545025825500488, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8666857481002808, + "num_tokens": 129331637.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "grad_norm": 1.5887084007263184, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.861983060836792, + "num_tokens": 129365228.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "grad_norm": 1.5487772226333618, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8467357754707336, + "num_tokens": 129405461.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "grad_norm": 1.4258893728256226, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8594410419464111, + "num_tokens": 129447194.0, + "step": 3389 + }, + { + "epoch": 0.4312428444218293, + "grad_norm": 1.5920369625091553, + "learning_rate": 1e-06, 
+ "loss": 0.3626, + "mean_token_accuracy": 0.8745712041854858, + "num_tokens": 129480196.0, + "step": 3390 + }, + { + "epoch": 0.4313700547004198, + "grad_norm": 1.5338711738586426, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8545366525650024, + "num_tokens": 129514418.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "grad_norm": 1.5889537334442139, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8505082130432129, + "num_tokens": 129551177.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "grad_norm": 1.6113042831420898, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8507135510444641, + "num_tokens": 129588210.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "grad_norm": 1.5246336460113525, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8488785028457642, + "num_tokens": 129628355.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "grad_norm": 1.5105772018432617, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.862255334854126, + "num_tokens": 129667768.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "grad_norm": 1.4940565824508667, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8594075441360474, + "num_tokens": 129707718.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "grad_norm": 1.4183979034423828, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.870003879070282, + "num_tokens": 129746733.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "grad_norm": 1.5716915130615234, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.862065315246582, + "num_tokens": 129780583.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "grad_norm": 1.6131740808486938, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8723739385604858, + "num_tokens": 129813763.0, + "step": 3399 + }, + { + "epoch": 
0.43251494720773437, + "grad_norm": 1.5401111841201782, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8592971563339233, + "num_tokens": 129851231.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "grad_norm": 1.4405417442321777, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8751357197761536, + "num_tokens": 129892411.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "grad_norm": 1.6610358953475952, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8625742197036743, + "num_tokens": 129925597.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "grad_norm": 1.449947476387024, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8680412769317627, + "num_tokens": 129965462.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "grad_norm": 1.523930311203003, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.852689266204834, + "num_tokens": 130004747.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "grad_norm": 1.4001199007034302, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8655111193656921, + "num_tokens": 130049694.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "grad_norm": 1.7126020193099976, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8504881858825684, + "num_tokens": 130084588.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "grad_norm": 1.3987517356872559, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8466845750808716, + "num_tokens": 130131623.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "grad_norm": 1.74496328830719, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8374394774436951, + "num_tokens": 130164355.0, + "step": 3408 + }, + { + "epoch": 0.43365983971504896, + "grad_norm": 1.5611886978149414, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 
0.865973949432373, + "num_tokens": 130196699.0, + "step": 3409 + }, + { + "epoch": 0.4337870499936395, + "grad_norm": 1.4444705247879028, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8610918521881104, + "num_tokens": 130236054.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "grad_norm": 1.485877275466919, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8483351469039917, + "num_tokens": 130275854.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "grad_norm": 1.4683926105499268, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.874036967754364, + "num_tokens": 130315386.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "grad_norm": 1.576533555984497, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8506125807762146, + "num_tokens": 130353474.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "grad_norm": 1.477028489112854, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8449598550796509, + "num_tokens": 130394241.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "grad_norm": 1.6998344659805298, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8598068356513977, + "num_tokens": 130426502.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "grad_norm": 1.618260383605957, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8685435056686401, + "num_tokens": 130461758.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "grad_norm": 1.4587851762771606, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8773622512817383, + "num_tokens": 130499390.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "grad_norm": 1.7252399921417236, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8578924536705017, + "num_tokens": 130530690.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "grad_norm": 
1.467570185661316, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8719874620437622, + "num_tokens": 130573716.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "grad_norm": 1.4510716199874878, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8395761251449585, + "num_tokens": 130616886.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "grad_norm": 1.5282299518585205, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8494853973388672, + "num_tokens": 130654935.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "grad_norm": 1.492361307144165, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8553725481033325, + "num_tokens": 130693241.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "grad_norm": 1.555698037147522, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8635890483856201, + "num_tokens": 130730977.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "grad_norm": 1.557308554649353, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.87485671043396, + "num_tokens": 130763675.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "grad_norm": 1.690843105316162, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8562436103820801, + "num_tokens": 130796313.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "grad_norm": 1.6280224323272705, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8342665433883667, + "num_tokens": 130834728.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "grad_norm": 1.5410456657409668, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8609429597854614, + "num_tokens": 130870728.0, + "step": 3427 + }, + { + "epoch": 0.43607683500826866, + "grad_norm": 1.499467134475708, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8619208335876465, + "num_tokens": 
130909883.0, + "step": 3428 + }, + { + "epoch": 0.4362040452868592, + "grad_norm": 1.690023422241211, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8595430850982666, + "num_tokens": 130946440.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "grad_norm": 1.6335952281951904, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8498576879501343, + "num_tokens": 130980057.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "grad_norm": 1.5445740222930908, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8664398193359375, + "num_tokens": 131015336.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "grad_norm": 1.5797159671783447, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8750016689300537, + "num_tokens": 131051213.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "grad_norm": 1.570309042930603, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8491613864898682, + "num_tokens": 131087762.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "grad_norm": 1.5604021549224854, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8706647157669067, + "num_tokens": 131124641.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "grad_norm": 1.6073236465454102, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8448351621627808, + "num_tokens": 131165445.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "grad_norm": 1.557352900505066, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8777929544448853, + "num_tokens": 131201590.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "grad_norm": 1.5257962942123413, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.854935884475708, + "num_tokens": 131239931.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "grad_norm": 1.4168035984039307, + "learning_rate": 1e-06, 
+ "loss": 0.4054, + "mean_token_accuracy": 0.8627724051475525, + "num_tokens": 131281763.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "grad_norm": 1.4037889242172241, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8582582473754883, + "num_tokens": 131324857.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "grad_norm": 1.4255127906799316, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8683911561965942, + "num_tokens": 131369461.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "grad_norm": 1.4512600898742676, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8652596473693848, + "num_tokens": 131410332.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "grad_norm": 1.5672988891601562, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8638396263122559, + "num_tokens": 131450207.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "grad_norm": 1.5286359786987305, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8596476316452026, + "num_tokens": 131489914.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "grad_norm": 1.7163318395614624, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8422273397445679, + "num_tokens": 131527729.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "grad_norm": 1.4426151514053345, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8568110466003418, + "num_tokens": 131567212.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "grad_norm": 1.5328911542892456, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8620551824569702, + "num_tokens": 131604840.0, + "step": 3446 + }, + { + "epoch": 0.43849383030148836, + "grad_norm": 1.455108404159546, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8841718435287476, + "num_tokens": 131642263.0, + "step": 3447 + }, + { + 
"epoch": 0.4386210405800789, + "grad_norm": 1.475334644317627, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8679247498512268, + "num_tokens": 131683149.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "grad_norm": 1.5097236633300781, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8667911291122437, + "num_tokens": 131719839.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "grad_norm": 1.59334397315979, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.856853723526001, + "num_tokens": 131757231.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "grad_norm": 1.4291393756866455, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8717622756958008, + "num_tokens": 131795772.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "grad_norm": 1.436247706413269, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8728532195091248, + "num_tokens": 131834955.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "grad_norm": 1.5104354619979858, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8751141428947449, + "num_tokens": 131871523.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "grad_norm": 1.5546103715896606, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8486577868461609, + "num_tokens": 131909645.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "grad_norm": 1.5352474451065063, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8710052967071533, + "num_tokens": 131944231.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "grad_norm": 1.5835164785385132, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8634138107299805, + "num_tokens": 131979761.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "grad_norm": 1.5653772354125977, + "learning_rate": 1e-06, + "loss": 0.424, + 
"mean_token_accuracy": 0.8497296571731567, + "num_tokens": 132015316.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "grad_norm": 1.7261425256729126, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.856184184551239, + "num_tokens": 132048703.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "grad_norm": 1.6572185754776, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8667592406272888, + "num_tokens": 132082378.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "grad_norm": 1.4089394807815552, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8731027245521545, + "num_tokens": 132121739.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "grad_norm": 1.5896339416503906, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8644134998321533, + "num_tokens": 132157240.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "grad_norm": 1.4948283433914185, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8637220859527588, + "num_tokens": 132198889.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "grad_norm": 1.6062486171722412, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8525785207748413, + "num_tokens": 132236093.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "grad_norm": 1.5422183275222778, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8640156984329224, + "num_tokens": 132271404.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "grad_norm": 1.6632535457611084, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8696383237838745, + "num_tokens": 132302599.0, + "step": 3465 + }, + { + "epoch": 0.44091082559470807, + "grad_norm": 1.532100796699524, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8619134426116943, + "num_tokens": 132342518.0, + "step": 3466 + }, + { + "epoch": 0.44103803587329854, + 
"grad_norm": 1.6583045721054077, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8435728549957275, + "num_tokens": 132377864.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "grad_norm": 1.5012913942337036, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8566304445266724, + "num_tokens": 132420473.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "grad_norm": 1.4560600519180298, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8714231252670288, + "num_tokens": 132460642.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "grad_norm": 1.4140124320983887, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8696773052215576, + "num_tokens": 132500344.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "grad_norm": 1.501504898071289, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8495333194732666, + "num_tokens": 132541045.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "grad_norm": 1.6289591789245605, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8541474342346191, + "num_tokens": 132576615.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "grad_norm": 1.7552300691604614, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8591561317443848, + "num_tokens": 132607946.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "grad_norm": 1.4705400466918945, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8749716281890869, + "num_tokens": 132647990.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "grad_norm": 1.7319527864456177, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8533421158790588, + "num_tokens": 132691051.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "grad_norm": 1.4839873313903809, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8543827533721924, + 
"num_tokens": 132728004.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "grad_norm": 1.5207304954528809, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8643506765365601, + "num_tokens": 132762272.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "grad_norm": 1.6290680170059204, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8358052968978882, + "num_tokens": 132797957.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "grad_norm": 1.517220139503479, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8606518507003784, + "num_tokens": 132833069.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "grad_norm": 1.5206456184387207, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8611065149307251, + "num_tokens": 132872389.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "grad_norm": 1.5043426752090454, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8577700257301331, + "num_tokens": 132915391.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "grad_norm": 1.6301146745681763, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.859416127204895, + "num_tokens": 132950013.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "grad_norm": 1.6264232397079468, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8601745963096619, + "num_tokens": 132982552.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "grad_norm": 1.670348048210144, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8637859225273132, + "num_tokens": 133018609.0, + "step": 3484 + }, + { + "epoch": 0.44332782088792777, + "grad_norm": 1.5375571250915527, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8555729389190674, + "num_tokens": 133058499.0, + "step": 3485 + }, + { + "epoch": 0.44345503116651824, + "grad_norm": 1.6758984327316284, + 
"learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8508307933807373, + "num_tokens": 133089359.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "grad_norm": 1.4590613842010498, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.856403648853302, + "num_tokens": 133130601.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "grad_norm": 1.5448532104492188, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8631777763366699, + "num_tokens": 133167380.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "grad_norm": 1.3969987630844116, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8566700220108032, + "num_tokens": 133214845.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "grad_norm": 1.6163970232009888, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8689060211181641, + "num_tokens": 133246249.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "grad_norm": 1.599074125289917, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8460878729820251, + "num_tokens": 133281943.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "grad_norm": 1.5072377920150757, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.865315854549408, + "num_tokens": 133319808.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "grad_norm": 1.5896053314208984, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8499969244003296, + "num_tokens": 133354752.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "grad_norm": 1.6342830657958984, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8532421588897705, + "num_tokens": 133388388.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "grad_norm": 1.5325019359588623, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8654729723930359, + "num_tokens": 133428225.0, + "step": 
3495 + }, + { + "epoch": 0.44472713395242336, + "grad_norm": 1.5679832696914673, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.863551914691925, + "num_tokens": 133463052.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "grad_norm": 1.5867596864700317, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8717051148414612, + "num_tokens": 133496323.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "grad_norm": 1.7188358306884766, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8769943714141846, + "num_tokens": 133525267.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "grad_norm": 1.5024199485778809, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8579851984977722, + "num_tokens": 133564285.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "grad_norm": 1.6416492462158203, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8567783236503601, + "num_tokens": 133600708.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "grad_norm": 1.5412144660949707, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.872855544090271, + "num_tokens": 133635121.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "grad_norm": 1.4164505004882812, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8734798431396484, + "num_tokens": 133675052.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "grad_norm": 1.6293939352035522, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8684189915657043, + "num_tokens": 133712606.0, + "step": 3503 + }, + { + "epoch": 0.4457448161811474, + "grad_norm": 1.5280922651290894, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8506371974945068, + "num_tokens": 133755137.0, + "step": 3504 + }, + { + "epoch": 0.44587202645973795, + "grad_norm": 1.47752046585083, + "learning_rate": 1e-06, + "loss": 0.396, + 
"mean_token_accuracy": 0.8641024827957153, + "num_tokens": 133791099.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "grad_norm": 1.4627678394317627, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.85981285572052, + "num_tokens": 133830615.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "grad_norm": 1.5482943058013916, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8583727478981018, + "num_tokens": 133868813.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "grad_norm": 1.4888771772384644, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.857198178768158, + "num_tokens": 133907726.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "grad_norm": 1.6012107133865356, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8652616143226624, + "num_tokens": 133945679.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "grad_norm": 1.6177952289581299, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8632693886756897, + "num_tokens": 133984533.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "grad_norm": 1.5120874643325806, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8550282716751099, + "num_tokens": 134022911.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "grad_norm": 1.6880865097045898, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8684080243110657, + "num_tokens": 134059740.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "grad_norm": 1.4271090030670166, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.88349449634552, + "num_tokens": 134099953.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "grad_norm": 1.4655872583389282, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8672219514846802, + "num_tokens": 134136271.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + 
"grad_norm": 1.6571801900863647, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8660613894462585, + "num_tokens": 134172972.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "grad_norm": 1.5721784830093384, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.84786456823349, + "num_tokens": 134208586.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "grad_norm": 1.6990424394607544, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8387424945831299, + "num_tokens": 134248556.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "grad_norm": 1.6035749912261963, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8730063438415527, + "num_tokens": 134286866.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "grad_norm": 1.5705982446670532, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8640479445457458, + "num_tokens": 134322032.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "grad_norm": 1.4979342222213745, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8581447005271912, + "num_tokens": 134357407.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "grad_norm": 1.551176905632019, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8501831293106079, + "num_tokens": 134396886.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "grad_norm": 1.5392073392868042, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8633201718330383, + "num_tokens": 134432503.0, + "step": 3522 + }, + { + "epoch": 0.4481618114743671, + "grad_norm": 1.4876073598861694, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8536970615386963, + "num_tokens": 134472657.0, + "step": 3523 + }, + { + "epoch": 0.44828902175295765, + "grad_norm": 1.5622203350067139, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8625614047050476, + 
"num_tokens": 134511978.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "grad_norm": 1.602946400642395, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8724519610404968, + "num_tokens": 134547316.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "grad_norm": 1.4162603616714478, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8547660112380981, + "num_tokens": 134593715.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "grad_norm": 1.435486078262329, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8772760629653931, + "num_tokens": 134632999.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "grad_norm": 1.61510169506073, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8608459234237671, + "num_tokens": 134669832.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "grad_norm": 1.4866344928741455, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8463510274887085, + "num_tokens": 134709064.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "grad_norm": 1.4721499681472778, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8588879108428955, + "num_tokens": 134746702.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "grad_norm": 1.678973913192749, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.86918044090271, + "num_tokens": 134786754.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "grad_norm": 1.5793612003326416, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8792392015457153, + "num_tokens": 134825915.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "grad_norm": 1.3770496845245361, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8762522339820862, + "num_tokens": 134870390.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "grad_norm": 1.4696030616760254, + 
"learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8641690015792847, + "num_tokens": 134911604.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "grad_norm": 1.4835927486419678, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8742662072181702, + "num_tokens": 134950647.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "grad_norm": 1.7612323760986328, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8737205266952515, + "num_tokens": 134981812.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "grad_norm": 1.460241436958313, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8567215800285339, + "num_tokens": 135020953.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "grad_norm": 1.6280021667480469, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8681575655937195, + "num_tokens": 135059386.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "grad_norm": 1.6039570569992065, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8521819114685059, + "num_tokens": 135095407.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "grad_norm": 1.596382737159729, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8851050138473511, + "num_tokens": 135126166.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "grad_norm": 1.5663045644760132, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8562171459197998, + "num_tokens": 135166425.0, + "step": 3541 + }, + { + "epoch": 0.4505788067675868, + "grad_norm": 1.495908260345459, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8609086275100708, + "num_tokens": 135206480.0, + "step": 3542 + }, + { + "epoch": 0.45070601704617735, + "grad_norm": 1.614401936531067, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8476381301879883, + "num_tokens": 135245807.0, + "step": 
3543 + }, + { + "epoch": 0.4508332273247678, + "grad_norm": 1.586928129196167, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8574979901313782, + "num_tokens": 135282785.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "grad_norm": 1.6078579425811768, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8715567588806152, + "num_tokens": 135316379.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "grad_norm": 1.5671021938323975, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8762757778167725, + "num_tokens": 135348433.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "grad_norm": 1.559178352355957, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8772028684616089, + "num_tokens": 135380804.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "grad_norm": 1.4235771894454956, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8700777888298035, + "num_tokens": 135420957.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "grad_norm": 1.5271613597869873, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8692831993103027, + "num_tokens": 135461221.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "grad_norm": 1.6148301362991333, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8535616397857666, + "num_tokens": 135497213.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "grad_norm": 1.4925346374511719, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8622527718544006, + "num_tokens": 135537214.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "grad_norm": 1.6149885654449463, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8666254281997681, + "num_tokens": 135570789.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "grad_norm": 1.4204000234603882, + "learning_rate": 1e-06, + "loss": 0.4089, + 
"mean_token_accuracy": 0.8611674308776855, + "num_tokens": 135612647.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "grad_norm": 1.5241469144821167, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8536321520805359, + "num_tokens": 135649663.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "grad_norm": 1.7552934885025024, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8540997505187988, + "num_tokens": 135678966.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "grad_norm": 1.5024751424789429, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.84332275390625, + "num_tokens": 135718377.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "grad_norm": 1.5593717098236084, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8574072122573853, + "num_tokens": 135754597.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "grad_norm": 1.409521222114563, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8653331995010376, + "num_tokens": 135796501.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "grad_norm": 1.535446286201477, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8594870567321777, + "num_tokens": 135833696.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "grad_norm": 1.70292329788208, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.845937192440033, + "num_tokens": 135868867.0, + "step": 3560 + }, + { + "epoch": 0.45299580206080653, + "grad_norm": 1.3929966688156128, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8706932663917542, + "num_tokens": 135911227.0, + "step": 3561 + }, + { + "epoch": 0.453123012339397, + "grad_norm": 1.4648821353912354, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.862521767616272, + "num_tokens": 135950549.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + 
"grad_norm": 1.5050525665283203, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8538608551025391, + "num_tokens": 135988838.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "grad_norm": 1.4062612056732178, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8743443489074707, + "num_tokens": 136028707.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "grad_norm": 1.4804195165634155, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8715068101882935, + "num_tokens": 136064630.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "grad_norm": 1.534994125366211, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8557953834533691, + "num_tokens": 136105700.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "grad_norm": 1.6689831018447876, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8710834980010986, + "num_tokens": 136138317.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "grad_norm": 1.419764757156372, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8571916222572327, + "num_tokens": 136179254.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "grad_norm": 1.6134787797927856, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8484493494033813, + "num_tokens": 136218108.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "grad_norm": 1.6213297843933105, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8488807678222656, + "num_tokens": 136253691.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "grad_norm": 1.4178721904754639, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.877025842666626, + "num_tokens": 136291761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "grad_norm": 1.4643170833587646, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8603097200393677, + 
"num_tokens": 136332045.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "grad_norm": 1.4853843450546265, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8595086336135864, + "num_tokens": 136369377.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "grad_norm": 1.3923120498657227, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8626638054847717, + "num_tokens": 136415042.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "grad_norm": 1.5267846584320068, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8703309893608093, + "num_tokens": 136450201.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "grad_norm": 1.4164249897003174, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8648462295532227, + "num_tokens": 136491343.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "grad_norm": 1.30483877658844, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8776886463165283, + "num_tokens": 136535304.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "grad_norm": 1.4510701894760132, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.854945182800293, + "num_tokens": 136574886.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "grad_norm": 1.4362061023712158, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8702778220176697, + "num_tokens": 136611250.0, + "step": 3579 + }, + { + "epoch": 0.4554127973540262, + "grad_norm": 1.373216986656189, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8594133257865906, + "num_tokens": 136657620.0, + "step": 3580 + }, + { + "epoch": 0.4555400076326167, + "grad_norm": 1.6154426336288452, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8643029928207397, + "num_tokens": 136693176.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "grad_norm": 1.4966697692871094, + 
"learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8556511402130127, + "num_tokens": 136732554.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "grad_norm": 1.559333324432373, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8509011268615723, + "num_tokens": 136767890.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "grad_norm": 1.6332893371582031, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8767763376235962, + "num_tokens": 136799221.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "grad_norm": 1.3956408500671387, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8704577684402466, + "num_tokens": 136843595.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "grad_norm": 1.4800704717636108, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8612673282623291, + "num_tokens": 136886069.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "grad_norm": 1.5557833909988403, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.85657799243927, + "num_tokens": 136922415.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "grad_norm": 1.457168459892273, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8719137907028198, + "num_tokens": 136959155.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "grad_norm": 1.459792137145996, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8650062084197998, + "num_tokens": 136999030.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "grad_norm": 1.616115927696228, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8550567030906677, + "num_tokens": 137032029.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "grad_norm": 1.593815803527832, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8545941114425659, + "num_tokens": 137069753.0, + "step": 3591 + 
}, + { + "epoch": 0.45693932069711235, + "grad_norm": 1.6103590726852417, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8479511141777039, + "num_tokens": 137104988.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "grad_norm": 1.4472601413726807, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8734837174415588, + "num_tokens": 137143677.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "grad_norm": 1.5702687501907349, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8562325239181519, + "num_tokens": 137180744.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "grad_norm": 1.5789097547531128, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8690770864486694, + "num_tokens": 137217077.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "grad_norm": 1.7217353582382202, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.868706226348877, + "num_tokens": 137245288.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "grad_norm": 1.5034947395324707, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8556085824966431, + "num_tokens": 137283251.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "grad_norm": 1.4685349464416504, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8676245212554932, + "num_tokens": 137321739.0, + "step": 3598 + }, + { + "epoch": 0.4578297926472459, + "grad_norm": 1.4919425249099731, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8736302852630615, + "num_tokens": 137360356.0, + "step": 3599 + }, + { + "epoch": 0.4579570029258364, + "grad_norm": 1.4540977478027344, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.858303427696228, + "num_tokens": 137398988.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "grad_norm": 1.3718551397323608, + "learning_rate": 1e-06, + "loss": 0.3455, + 
"mean_token_accuracy": 0.8784751296043396, + "num_tokens": 137439579.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "grad_norm": 1.5051268339157104, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8684800267219543, + "num_tokens": 137477807.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "grad_norm": 1.5878652334213257, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8492648601531982, + "num_tokens": 137516127.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "grad_norm": 1.69070565700531, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8491019606590271, + "num_tokens": 137556297.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "grad_norm": 1.4696003198623657, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.862918496131897, + "num_tokens": 137599396.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "grad_norm": 1.4764026403427124, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8510390520095825, + "num_tokens": 137638573.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "grad_norm": 1.4896296262741089, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8549721240997314, + "num_tokens": 137677304.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "grad_norm": 1.4162676334381104, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8787147998809814, + "num_tokens": 137715143.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "grad_norm": 1.6293624639511108, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8489090800285339, + "num_tokens": 137752858.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "grad_norm": 1.4124515056610107, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8712263703346252, + "num_tokens": 137794560.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + 
"grad_norm": 1.5100377798080444, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8642256259918213, + "num_tokens": 137835870.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "grad_norm": 1.481587290763855, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8390207886695862, + "num_tokens": 137880619.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "grad_norm": 1.629028081893921, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.862024188041687, + "num_tokens": 137915139.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "grad_norm": 1.7023897171020508, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8520944118499756, + "num_tokens": 137951840.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "grad_norm": 1.5055993795394897, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8568634986877441, + "num_tokens": 137990311.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "grad_norm": 1.455084204673767, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8747208118438721, + "num_tokens": 138025566.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "grad_norm": 1.4257111549377441, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8616057634353638, + "num_tokens": 138067132.0, + "step": 3617 + }, + { + "epoch": 0.4602467879404656, + "grad_norm": 1.5591635704040527, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8678049445152283, + "num_tokens": 138103275.0, + "step": 3618 + }, + { + "epoch": 0.4603739982190561, + "grad_norm": 1.498204231262207, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8713613748550415, + "num_tokens": 138147178.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "grad_norm": 1.533238172531128, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8667562007904053, + 
"num_tokens": 138182390.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "grad_norm": 1.5608010292053223, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.843593180179596, + "num_tokens": 138221581.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "grad_norm": 1.4703214168548584, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8783494234085083, + "num_tokens": 138257970.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "grad_norm": 1.4401992559432983, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8717731833457947, + "num_tokens": 138297172.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "grad_norm": 1.6941771507263184, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8507755398750305, + "num_tokens": 138329739.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "grad_norm": 1.438137412071228, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8720762729644775, + "num_tokens": 138371899.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "grad_norm": 1.5013296604156494, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8585237860679626, + "num_tokens": 138411903.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "grad_norm": 1.5386977195739746, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8530483245849609, + "num_tokens": 138450307.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "grad_norm": 1.4984623193740845, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.868944525718689, + "num_tokens": 138487824.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "grad_norm": 1.5643577575683594, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8454444408416748, + "num_tokens": 138526776.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "grad_norm": 1.5057474374771118, + 
"learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8656125664710999, + "num_tokens": 138565769.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "grad_norm": 1.392885684967041, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8652707934379578, + "num_tokens": 138607938.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "grad_norm": 1.618783950805664, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8592696189880371, + "num_tokens": 138643504.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "grad_norm": 1.4940770864486694, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8669338226318359, + "num_tokens": 138685085.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "grad_norm": 1.5679130554199219, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8480489253997803, + "num_tokens": 138726097.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "grad_norm": 1.5086511373519897, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.853191077709198, + "num_tokens": 138767157.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "grad_norm": 1.6294792890548706, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8492789268493652, + "num_tokens": 138803279.0, + "step": 3636 + }, + { + "epoch": 0.4626637832336853, + "grad_norm": 1.5058025121688843, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8632707595825195, + "num_tokens": 138841467.0, + "step": 3637 + }, + { + "epoch": 0.4627909935122758, + "grad_norm": 1.5938369035720825, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8600037097930908, + "num_tokens": 138878381.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "grad_norm": 1.5394121408462524, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8692052364349365, + "num_tokens": 138915453.0, + "step": 
3639 + }, + { + "epoch": 0.4630454140694568, + "grad_norm": 1.4481161832809448, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.863529622554779, + "num_tokens": 138953742.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "grad_norm": 1.4861122369766235, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8605107069015503, + "num_tokens": 138994346.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "grad_norm": 1.4825224876403809, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8592526912689209, + "num_tokens": 139036754.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "grad_norm": 1.5875656604766846, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8615386486053467, + "num_tokens": 139072311.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "grad_norm": 1.5717415809631348, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8504772782325745, + "num_tokens": 139111623.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "grad_norm": 1.5957250595092773, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8701202869415283, + "num_tokens": 139143798.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "grad_norm": 1.5462933778762817, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8652201890945435, + "num_tokens": 139181484.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "grad_norm": 1.5967557430267334, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8650189638137817, + "num_tokens": 139216190.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "grad_norm": 1.4565781354904175, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8728480935096741, + "num_tokens": 139252374.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "grad_norm": 1.431486964225769, + "learning_rate": 1e-06, + "loss": 0.3699, + 
"mean_token_accuracy": 0.8707647323608398, + "num_tokens": 139295404.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "grad_norm": 1.595784068107605, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8678517937660217, + "num_tokens": 139332539.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "grad_norm": 1.5415388345718384, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8669755458831787, + "num_tokens": 139367219.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "grad_norm": 1.5093237161636353, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8565795421600342, + "num_tokens": 139406478.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "grad_norm": 1.417725682258606, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8712402582168579, + "num_tokens": 139448983.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "grad_norm": 1.5831952095031738, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8628942966461182, + "num_tokens": 139485691.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "grad_norm": 1.4715017080307007, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8617529273033142, + "num_tokens": 139526424.0, + "step": 3655 + }, + { + "epoch": 0.465080778526905, + "grad_norm": 1.6603975296020508, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8608676195144653, + "num_tokens": 139563351.0, + "step": 3656 + }, + { + "epoch": 0.46520798880549546, + "grad_norm": 1.4044896364212036, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8574739098548889, + "num_tokens": 139608857.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "grad_norm": 1.6681897640228271, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8495843410491943, + "num_tokens": 139646435.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, 
+ "grad_norm": 1.537621259689331, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8725649118423462, + "num_tokens": 139681652.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "grad_norm": 1.8608251810073853, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8704639673233032, + "num_tokens": 139709434.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "grad_norm": 1.5109434127807617, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8624491691589355, + "num_tokens": 139747805.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "grad_norm": 1.5994784832000732, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8521442413330078, + "num_tokens": 139785692.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "grad_norm": 1.5370961427688599, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8580595254898071, + "num_tokens": 139821462.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "grad_norm": 1.5031089782714844, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8704220056533813, + "num_tokens": 139857774.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "grad_norm": 1.3958412408828735, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8548235297203064, + "num_tokens": 139902991.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "grad_norm": 1.4184973239898682, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8800832033157349, + "num_tokens": 139941238.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "grad_norm": 1.540717601776123, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.858320951461792, + "num_tokens": 139978895.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "grad_norm": 1.5792537927627563, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8539058566093445, + 
"num_tokens": 140013785.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "grad_norm": 1.4662705659866333, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.861576497554779, + "num_tokens": 140054553.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "grad_norm": 1.4822336435317993, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8600955009460449, + "num_tokens": 140092622.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "grad_norm": 1.6129504442214966, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8617212176322937, + "num_tokens": 140127162.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "grad_norm": 1.6655056476593018, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8648107051849365, + "num_tokens": 140162091.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "grad_norm": 1.4165089130401611, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8564265370368958, + "num_tokens": 140203520.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "grad_norm": 1.695878267288208, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8673610091209412, + "num_tokens": 140237923.0, + "step": 3674 + }, + { + "epoch": 0.46749777382012464, + "grad_norm": 1.6459935903549194, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8649381399154663, + "num_tokens": 140272063.0, + "step": 3675 + }, + { + "epoch": 0.46762498409871517, + "grad_norm": 1.5672212839126587, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8637937307357788, + "num_tokens": 140308934.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "grad_norm": 1.5141171216964722, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8633986115455627, + "num_tokens": 140348872.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "grad_norm": 1.6149494647979736, + 
"learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8574177622795105, + "num_tokens": 140385593.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "grad_norm": 1.6802557706832886, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8609485626220703, + "num_tokens": 140420168.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "grad_norm": 1.6711909770965576, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.850699782371521, + "num_tokens": 140454899.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "grad_norm": 1.4712564945220947, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8606880903244019, + "num_tokens": 140492882.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "grad_norm": 1.4913617372512817, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8652760982513428, + "num_tokens": 140531659.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "grad_norm": 1.516288161277771, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8463931083679199, + "num_tokens": 140571578.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "grad_norm": 1.6001604795455933, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8585917353630066, + "num_tokens": 140605423.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "grad_norm": 1.3568893671035767, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8689508438110352, + "num_tokens": 140647284.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "grad_norm": 1.5749250650405884, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8671269416809082, + "num_tokens": 140682074.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "grad_norm": 1.4343962669372559, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8724348545074463, + "num_tokens": 140720562.0, + "step": 
3687 + }, + { + "epoch": 0.4691515074418013, + "grad_norm": 1.6248475313186646, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8753598928451538, + "num_tokens": 140751968.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "grad_norm": 1.4823616743087769, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.868622899055481, + "num_tokens": 140787836.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "grad_norm": 1.3371366262435913, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8773808479309082, + "num_tokens": 140832076.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "grad_norm": 1.477805495262146, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8702582716941833, + "num_tokens": 140869685.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "grad_norm": 1.5815883874893188, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8619016408920288, + "num_tokens": 140908042.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "grad_norm": 1.521164059638977, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8502652645111084, + "num_tokens": 140947097.0, + "step": 3693 + }, + { + "epoch": 0.46991476911334434, + "grad_norm": 1.3655987977981567, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.869218111038208, + "num_tokens": 140989216.0, + "step": 3694 + }, + { + "epoch": 0.47004197939193487, + "grad_norm": 1.48495614528656, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8659678101539612, + "num_tokens": 141028973.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "grad_norm": 1.5496265888214111, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8597319722175598, + "num_tokens": 141065374.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "grad_norm": 1.3881893157958984, + "learning_rate": 1e-06, + "loss": 0.3893, + 
"mean_token_accuracy": 0.8636010885238647, + "num_tokens": 141106247.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "grad_norm": 1.5959116220474243, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8529925346374512, + "num_tokens": 141141074.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "grad_norm": 1.396390438079834, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8695765733718872, + "num_tokens": 141185460.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "grad_norm": 1.6302011013031006, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8614859580993652, + "num_tokens": 141219092.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "grad_norm": 1.4421708583831787, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.86314857006073, + "num_tokens": 141263714.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "grad_norm": 1.4414350986480713, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8561068773269653, + "num_tokens": 141302140.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "grad_norm": 1.5149590969085693, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8695261478424072, + "num_tokens": 141339742.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "grad_norm": 1.4871584177017212, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8633756637573242, + "num_tokens": 141379502.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "grad_norm": 1.6251784563064575, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8723910450935364, + "num_tokens": 141410994.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "grad_norm": 1.606200098991394, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8632000684738159, + "num_tokens": 141444943.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + 
"grad_norm": 1.4520044326782227, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8676030039787292, + "num_tokens": 141484778.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "grad_norm": 1.6430615186691284, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8433971405029297, + "num_tokens": 141519420.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "grad_norm": 1.479248285293579, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8537507057189941, + "num_tokens": 141559680.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "grad_norm": 1.6755163669586182, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8473156690597534, + "num_tokens": 141592630.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "grad_norm": 1.4407840967178345, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8591920733451843, + "num_tokens": 141632959.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "grad_norm": 1.5114095211029053, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8557643294334412, + "num_tokens": 141671199.0, + "step": 3712 + }, + { + "epoch": 0.47233176440656405, + "grad_norm": 1.4455617666244507, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8736869096755981, + "num_tokens": 141709756.0, + "step": 3713 + }, + { + "epoch": 0.4724589746851546, + "grad_norm": 1.5575203895568848, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8721256852149963, + "num_tokens": 141748105.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "grad_norm": 1.5409226417541504, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8550060987472534, + "num_tokens": 141785915.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "grad_norm": 1.4420112371444702, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8603601455688477, + 
"num_tokens": 141827064.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "grad_norm": 1.4393235445022583, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8610475659370422, + "num_tokens": 141868391.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "grad_norm": 1.4077363014221191, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8516039252281189, + "num_tokens": 141910751.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "grad_norm": 1.6634865999221802, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8613852262496948, + "num_tokens": 141945375.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "grad_norm": 1.3676350116729736, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8668327331542969, + "num_tokens": 141990647.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "grad_norm": 1.4304783344268799, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8836617469787598, + "num_tokens": 142031729.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "grad_norm": 1.5186748504638672, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8562933206558228, + "num_tokens": 142070287.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "grad_norm": 1.3916850090026855, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8760005235671997, + "num_tokens": 142115778.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "grad_norm": 1.4940937757492065, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8686689138412476, + "num_tokens": 142153836.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "grad_norm": 1.4778717756271362, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8646491765975952, + "num_tokens": 142193412.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "grad_norm": 1.6162787675857544, + 
"learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8759092092514038, + "num_tokens": 142231966.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "grad_norm": 1.5955588817596436, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8658804893493652, + "num_tokens": 142268389.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "grad_norm": 1.6003811359405518, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8703269958496094, + "num_tokens": 142303163.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "grad_norm": 1.6713649034500122, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8658887147903442, + "num_tokens": 142336558.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "grad_norm": 1.5493959188461304, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8697855472564697, + "num_tokens": 142372313.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "grad_norm": 1.4823089838027954, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.870922863483429, + "num_tokens": 142408489.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + "grad_norm": 1.4996261596679688, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8742908239364624, + "num_tokens": 142444027.0, + "step": 3732 + }, + { + "epoch": 0.4748759699783743, + "grad_norm": 1.6984201669692993, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8609539270401001, + "num_tokens": 142478702.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "grad_norm": 1.4526004791259766, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8652334809303284, + "num_tokens": 142520641.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "grad_norm": 1.5510610342025757, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8666645288467407, + "num_tokens": 142555806.0, + "step": 
3735 + }, + { + "epoch": 0.4752576008141458, + "grad_norm": 1.4135864973068237, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8700388669967651, + "num_tokens": 142596727.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "grad_norm": 1.4111846685409546, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8689935207366943, + "num_tokens": 142638766.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "grad_norm": 1.5000483989715576, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8538049459457397, + "num_tokens": 142676257.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "grad_norm": 1.4785321950912476, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8679980039596558, + "num_tokens": 142718690.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "grad_norm": 1.408851981163025, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.874906599521637, + "num_tokens": 142755985.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "grad_norm": 1.5658191442489624, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8582473993301392, + "num_tokens": 142794222.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "grad_norm": 1.5228371620178223, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8529736995697021, + "num_tokens": 142832105.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "grad_norm": 1.4954150915145874, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8741945028305054, + "num_tokens": 142871402.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "grad_norm": 1.4202942848205566, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8772724866867065, + "num_tokens": 142910418.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "grad_norm": 1.392242670059204, + "learning_rate": 1e-06, + "loss": 0.3687, + 
"mean_token_accuracy": 0.8738273978233337, + "num_tokens": 142952505.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "grad_norm": 1.4753944873809814, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8642235994338989, + "num_tokens": 142996082.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "grad_norm": 1.4403390884399414, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8636788129806519, + "num_tokens": 143037482.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "grad_norm": 1.6020190715789795, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8773996233940125, + "num_tokens": 143069370.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "grad_norm": 1.4691195487976074, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.854982852935791, + "num_tokens": 143111037.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "grad_norm": 1.5905810594558716, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8488913774490356, + "num_tokens": 143147168.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "grad_norm": 1.3887137174606323, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8632968664169312, + "num_tokens": 143191616.0, + "step": 3751 + }, + { + "epoch": 0.4772929652715939, + "grad_norm": 1.5950673818588257, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8540849685668945, + "num_tokens": 143230229.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "grad_norm": 1.443324089050293, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8888596296310425, + "num_tokens": 143267838.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "grad_norm": 1.4404399394989014, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8717647194862366, + "num_tokens": 143309566.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + 
"grad_norm": 1.639948844909668, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8544818162918091, + "num_tokens": 143343564.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "grad_norm": 1.5248126983642578, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8560550212860107, + "num_tokens": 143388195.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "grad_norm": 1.5044918060302734, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8699318170547485, + "num_tokens": 143425451.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "grad_norm": 1.6772849559783936, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8551398515701294, + "num_tokens": 143459024.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "grad_norm": 1.400844931602478, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8617096543312073, + "num_tokens": 143506583.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "grad_norm": 1.5411531925201416, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8615425825119019, + "num_tokens": 143546556.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "grad_norm": 1.6076819896697998, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8651291728019714, + "num_tokens": 143579668.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "grad_norm": 1.4890130758285522, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8546538352966309, + "num_tokens": 143622220.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "grad_norm": 1.6873003244400024, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8610142469406128, + "num_tokens": 143657289.0, + "step": 3763 + }, + { + "epoch": 0.47881948861468004, + "grad_norm": 1.512503743171692, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8667513728141785, + 
"num_tokens": 143692218.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "grad_norm": 1.555214285850525, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8761280179023743, + "num_tokens": 143728361.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "grad_norm": 1.5786263942718506, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8599680662155151, + "num_tokens": 143762958.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "grad_norm": 1.5074255466461182, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8603792190551758, + "num_tokens": 143802088.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "grad_norm": 1.5109704732894897, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8321806788444519, + "num_tokens": 143844643.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "grad_norm": 1.6621111631393433, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8562383651733398, + "num_tokens": 143876781.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "grad_norm": 1.6077499389648438, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8511572480201721, + "num_tokens": 143914987.0, + "step": 3770 + }, + { + "epoch": 0.47970996056481363, + "grad_norm": 1.5618841648101807, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8512102365493774, + "num_tokens": 143954108.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "grad_norm": 1.5778142213821411, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8589490652084351, + "num_tokens": 143991200.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "grad_norm": 1.5029670000076294, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8636988401412964, + "num_tokens": 144027477.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "grad_norm": 1.5582618713378906, + 
"learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8714058995246887, + "num_tokens": 144060554.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "grad_norm": 1.477901816368103, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8491664528846741, + "num_tokens": 144106153.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "grad_norm": 1.5097107887268066, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8729512691497803, + "num_tokens": 144143062.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "grad_norm": 1.4470620155334473, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8719960451126099, + "num_tokens": 144182100.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "grad_norm": 1.5086945295333862, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8514124155044556, + "num_tokens": 144224021.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "grad_norm": 1.4488483667373657, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8627203702926636, + "num_tokens": 144267633.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "grad_norm": 1.516977071762085, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8652142286300659, + "num_tokens": 144304289.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "grad_norm": 1.6102161407470703, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8558867573738098, + "num_tokens": 144339822.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "grad_norm": 1.4278907775878906, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.85991370677948, + "num_tokens": 144381876.0, + "step": 3782 + }, + { + "epoch": 0.48123648390789975, + "grad_norm": 1.523687720298767, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8498233556747437, + "num_tokens": 144422987.0, + "step": 3783 + 
}, + { + "epoch": 0.4813636941864903, + "grad_norm": 1.6993393898010254, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8649008274078369, + "num_tokens": 144452411.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "grad_norm": 1.4562675952911377, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8723081946372986, + "num_tokens": 144491668.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "grad_norm": 1.4546735286712646, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8631255626678467, + "num_tokens": 144530710.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "grad_norm": 1.5512620210647583, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8693812489509583, + "num_tokens": 144569264.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "grad_norm": 1.5463768243789673, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.858447253704071, + "num_tokens": 144607693.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "grad_norm": 1.461793303489685, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8611223697662354, + "num_tokens": 144654336.0, + "step": 3789 + }, + { + "epoch": 0.48212695585803333, + "grad_norm": 1.583443284034729, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.859521746635437, + "num_tokens": 144691486.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "grad_norm": 1.6014055013656616, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8489763140678406, + "num_tokens": 144729237.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "grad_norm": 1.5298439264297485, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8613222241401672, + "num_tokens": 144773499.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "grad_norm": 1.5347037315368652, + "learning_rate": 1e-06, + "loss": 0.4137, + 
"mean_token_accuracy": 0.8581501245498657, + "num_tokens": 144814520.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "grad_norm": 1.6606361865997314, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.86387038230896, + "num_tokens": 144848034.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "grad_norm": 1.589236855506897, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8581371307373047, + "num_tokens": 144883929.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "grad_norm": 1.5616328716278076, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8621875047683716, + "num_tokens": 144919146.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "grad_norm": 1.7566407918930054, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8620909452438354, + "num_tokens": 144946112.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "grad_norm": 1.4045937061309814, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8674467206001282, + "num_tokens": 144989842.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "grad_norm": 1.4580888748168945, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8584742546081543, + "num_tokens": 145028902.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "grad_norm": 1.5069069862365723, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8676055073738098, + "num_tokens": 145067242.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "grad_norm": 1.4589884281158447, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8555853962898254, + "num_tokens": 145110183.0, + "step": 3801 + }, + { + "epoch": 0.48365347920111945, + "grad_norm": 1.371590256690979, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.879408597946167, + "num_tokens": 145148012.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + 
"grad_norm": 1.3895374536514282, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8618058562278748, + "num_tokens": 145192724.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "grad_norm": 1.434101939201355, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8767237663269043, + "num_tokens": 145229723.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "grad_norm": 1.3770779371261597, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8759678602218628, + "num_tokens": 145270027.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "grad_norm": 1.6143593788146973, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8645502924919128, + "num_tokens": 145301138.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "grad_norm": 1.3971036672592163, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8632692098617554, + "num_tokens": 145347162.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "grad_norm": 1.4140130281448364, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8725800514221191, + "num_tokens": 145387377.0, + "step": 3808 + }, + { + "epoch": 0.48454395115125304, + "grad_norm": 1.4983539581298828, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8692291975021362, + "num_tokens": 145427504.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "grad_norm": 1.4674594402313232, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8721307516098022, + "num_tokens": 145465532.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "grad_norm": 1.558212399482727, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8380753993988037, + "num_tokens": 145505477.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "grad_norm": 1.6142946481704712, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8558799028396606, + 
"num_tokens": 145540437.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "grad_norm": 1.474792242050171, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.867135226726532, + "num_tokens": 145579349.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "grad_norm": 1.4906141757965088, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8638957738876343, + "num_tokens": 145616533.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "grad_norm": 1.3979939222335815, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8734021186828613, + "num_tokens": 145655575.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "grad_norm": 1.4495443105697632, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8775646090507507, + "num_tokens": 145691772.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "grad_norm": 1.6012694835662842, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.849378764629364, + "num_tokens": 145726610.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "grad_norm": 1.56582510471344, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8564823865890503, + "num_tokens": 145761563.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "grad_norm": 1.5787159204483032, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8478689193725586, + "num_tokens": 145802913.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "grad_norm": 1.523838996887207, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8610595464706421, + "num_tokens": 145841583.0, + "step": 3820 + }, + { + "epoch": 0.48607047449433916, + "grad_norm": 1.4551427364349365, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8656302690505981, + "num_tokens": 145883819.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "grad_norm": 1.5273666381835938, + 
"learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.842761218547821, + "num_tokens": 145926664.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "grad_norm": 1.451829433441162, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8610864281654358, + "num_tokens": 145964525.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "grad_norm": 1.402919054031372, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8663370609283447, + "num_tokens": 146005681.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "grad_norm": 1.5871301889419556, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8649888038635254, + "num_tokens": 146041650.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "grad_norm": 1.5296374559402466, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8751804828643799, + "num_tokens": 146076195.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "grad_norm": 1.4422577619552612, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8757717609405518, + "num_tokens": 146115159.0, + "step": 3827 + }, + { + "epoch": 0.4869609464444727, + "grad_norm": 1.4484751224517822, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8826236724853516, + "num_tokens": 146151618.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "grad_norm": 1.4540601968765259, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8730804920196533, + "num_tokens": 146191297.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "grad_norm": 1.6676993370056152, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8606176376342773, + "num_tokens": 146223837.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "grad_norm": 1.498844861984253, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8611500859260559, + "num_tokens": 146263546.0, + "step": 
3831 + }, + { + "epoch": 0.48746978755883474, + "grad_norm": 1.5057055950164795, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8704166412353516, + "num_tokens": 146297925.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "grad_norm": 1.4069833755493164, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8575892448425293, + "num_tokens": 146343334.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "grad_norm": 1.54510498046875, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8666432499885559, + "num_tokens": 146380848.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "grad_norm": 1.6145306825637817, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8495292067527771, + "num_tokens": 146416715.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "grad_norm": 1.652233600616455, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8621986508369446, + "num_tokens": 146448000.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "grad_norm": 1.595659852027893, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.881280779838562, + "num_tokens": 146483261.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "grad_norm": 1.5589033365249634, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8479358553886414, + "num_tokens": 146521893.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "grad_norm": 1.4506011009216309, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8655263781547546, + "num_tokens": 146562247.0, + "step": 3839 + }, + { + "epoch": 0.48848746978755886, + "grad_norm": 1.4209107160568237, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8598393201828003, + "num_tokens": 146603159.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "grad_norm": 1.5069218873977661, + "learning_rate": 1e-06, + "loss": 0.3621, + 
"mean_token_accuracy": 0.8764715790748596, + "num_tokens": 146640244.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "grad_norm": 2.18959903717041, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.863933265209198, + "num_tokens": 146672029.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "grad_norm": 1.564534306526184, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8662036657333374, + "num_tokens": 146712262.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "grad_norm": 1.6038178205490112, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.859026312828064, + "num_tokens": 146745825.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "grad_norm": 1.5432764291763306, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8705783486366272, + "num_tokens": 146783010.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "grad_norm": 1.4794050455093384, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8651325702667236, + "num_tokens": 146821605.0, + "step": 3846 + }, + { + "epoch": 0.4893779417376924, + "grad_norm": 1.4427406787872314, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.85114586353302, + "num_tokens": 146863169.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "grad_norm": 1.522132396697998, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.875249981880188, + "num_tokens": 146900715.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "grad_norm": 1.5524017810821533, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8624480962753296, + "num_tokens": 146940976.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "grad_norm": 1.6704655885696411, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8779851198196411, + "num_tokens": 146974228.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + 
"grad_norm": 1.5667810440063477, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.854963481426239, + "num_tokens": 147016319.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "grad_norm": 1.5248076915740967, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8689785003662109, + "num_tokens": 147052352.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "grad_norm": 1.4237793684005737, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8693429827690125, + "num_tokens": 147093000.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "grad_norm": 1.6083191633224487, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8477530479431152, + "num_tokens": 147131119.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "grad_norm": 1.440566897392273, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.867181658744812, + "num_tokens": 147170447.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "grad_norm": 1.5418434143066406, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.858270525932312, + "num_tokens": 147208207.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "grad_norm": 1.4736762046813965, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8782472610473633, + "num_tokens": 147247753.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "grad_norm": 1.5330625772476196, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8724982738494873, + "num_tokens": 147282034.0, + "step": 3858 + }, + { + "epoch": 0.4909044650807785, + "grad_norm": 1.6046671867370605, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8466609716415405, + "num_tokens": 147319354.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "grad_norm": 1.547536015510559, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.87682044506073, + 
"num_tokens": 147353545.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "grad_norm": 1.4977532625198364, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8486599326133728, + "num_tokens": 147394814.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "grad_norm": 1.5954722166061401, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.844801664352417, + "num_tokens": 147429851.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "grad_norm": 1.6168136596679688, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8566839694976807, + "num_tokens": 147471636.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "grad_norm": 1.5926225185394287, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8600468635559082, + "num_tokens": 147508619.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "grad_norm": 1.4687941074371338, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8741627931594849, + "num_tokens": 147549093.0, + "step": 3865 + }, + { + "epoch": 0.4917949370309121, + "grad_norm": 1.5293707847595215, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8549917340278625, + "num_tokens": 147586772.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "grad_norm": 1.5297715663909912, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8734650611877441, + "num_tokens": 147624680.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "grad_norm": 1.367861270904541, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8663811087608337, + "num_tokens": 147668461.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "grad_norm": 1.5635058879852295, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8654588460922241, + "num_tokens": 147708140.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "grad_norm": 1.5232912302017212, + 
"learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8560941219329834, + "num_tokens": 147748500.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "grad_norm": 1.5864614248275757, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.861074686050415, + "num_tokens": 147782396.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "grad_norm": 1.6021900177001953, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8532156944274902, + "num_tokens": 147819280.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "grad_norm": 1.439755916595459, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8605161905288696, + "num_tokens": 147860099.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "grad_norm": 1.41498601436615, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8672993779182434, + "num_tokens": 147904197.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "grad_norm": 1.5014173984527588, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8610857725143433, + "num_tokens": 147940586.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "grad_norm": 1.6008093357086182, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8689568042755127, + "num_tokens": 147977785.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "grad_norm": 1.5409212112426758, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8461334109306335, + "num_tokens": 148017034.0, + "step": 3877 + }, + { + "epoch": 0.4933214603739982, + "grad_norm": 1.5279240608215332, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8770809173583984, + "num_tokens": 148053967.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "grad_norm": 1.4688118696212769, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8691023588180542, + "num_tokens": 148091991.0, + "step": 
3879 + }, + { + "epoch": 0.4935758809311792, + "grad_norm": 1.6102815866470337, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8529603481292725, + "num_tokens": 148132512.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "grad_norm": 1.5146452188491821, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8530017137527466, + "num_tokens": 148171183.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "grad_norm": 1.6857212781906128, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8508727550506592, + "num_tokens": 148202746.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "grad_norm": 1.678600549697876, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8587954640388489, + "num_tokens": 148236152.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "grad_norm": 1.372045874595642, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8631700277328491, + "num_tokens": 148280558.0, + "step": 3884 + }, + { + "epoch": 0.4942119323241318, + "grad_norm": 1.797177791595459, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8534812927246094, + "num_tokens": 148311480.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "grad_norm": 1.4857878684997559, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8507591485977173, + "num_tokens": 148354564.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "grad_norm": 1.6290628910064697, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.872054398059845, + "num_tokens": 148387901.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "grad_norm": 1.5277992486953735, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8683847188949585, + "num_tokens": 148424783.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "grad_norm": 1.407219409942627, + "learning_rate": 1e-06, + "loss": 0.3973, + 
"mean_token_accuracy": 0.863345205783844, + "num_tokens": 148465250.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "grad_norm": 1.6337114572525024, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.860436201095581, + "num_tokens": 148498822.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "grad_norm": 1.504638910293579, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8744126558303833, + "num_tokens": 148536986.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "grad_norm": 1.498558759689331, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8672696948051453, + "num_tokens": 148576618.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "grad_norm": 1.491792917251587, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8601447343826294, + "num_tokens": 148616971.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "grad_norm": 1.8506615161895752, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8682737350463867, + "num_tokens": 148649013.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "grad_norm": 1.58609139919281, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8703222274780273, + "num_tokens": 148682592.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "grad_norm": 1.4032031297683716, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8809559345245361, + "num_tokens": 148718673.0, + "step": 3896 + }, + { + "epoch": 0.4957384556672179, + "grad_norm": 1.5714715719223022, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.874754786491394, + "num_tokens": 148752201.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "grad_norm": 1.5794497728347778, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.869534969329834, + "num_tokens": 148790926.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + 
"grad_norm": 1.455399513244629, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8583911061286926, + "num_tokens": 148833368.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "grad_norm": 1.3923890590667725, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8676940202713013, + "num_tokens": 148874497.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "grad_norm": 1.6035388708114624, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8529395461082458, + "num_tokens": 148916557.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "grad_norm": 1.6886019706726074, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8597806692123413, + "num_tokens": 148952144.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "grad_norm": 1.4690593481063843, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8533805012702942, + "num_tokens": 148994164.0, + "step": 3903 + }, + { + "epoch": 0.4966289276173515, + "grad_norm": 1.4703900814056396, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8649419546127319, + "num_tokens": 149033066.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "grad_norm": 1.4272841215133667, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8382452130317688, + "num_tokens": 149079116.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "grad_norm": 1.6115942001342773, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.874404788017273, + "num_tokens": 149110697.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "grad_norm": 1.5294796228408813, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8501887321472168, + "num_tokens": 149150269.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "grad_norm": 1.554214358329773, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8753388524055481, + 
"num_tokens": 149184259.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "grad_norm": 1.6088241338729858, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8467712998390198, + "num_tokens": 149222573.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "grad_norm": 1.6077865362167358, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8611096143722534, + "num_tokens": 149259047.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "grad_norm": 1.514595866203308, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.856998085975647, + "num_tokens": 149295006.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "grad_norm": 1.599705457687378, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8660653829574585, + "num_tokens": 149327654.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "grad_norm": 1.5102304220199585, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8747687339782715, + "num_tokens": 149366201.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "grad_norm": 1.6735236644744873, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8550894260406494, + "num_tokens": 149400982.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "grad_norm": 1.418150544166565, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8600391745567322, + "num_tokens": 149440689.0, + "step": 3915 + }, + { + "epoch": 0.4981554509604376, + "grad_norm": 1.4917186498641968, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8636164665222168, + "num_tokens": 149479775.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "grad_norm": 1.5161858797073364, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8713275194168091, + "num_tokens": 149515477.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "grad_norm": 1.5406070947647095, + 
"learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.882903516292572, + "num_tokens": 149550319.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "grad_norm": 1.4900264739990234, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8585598468780518, + "num_tokens": 149588237.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "grad_norm": 1.4472074508666992, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8743025064468384, + "num_tokens": 149627015.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "grad_norm": 1.578363060951233, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8596223592758179, + "num_tokens": 149661540.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "grad_norm": 1.554031491279602, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.862073540687561, + "num_tokens": 149698532.0, + "step": 3922 + }, + { + "epoch": 0.49904592291057115, + "grad_norm": 1.5112859010696411, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8453384637832642, + "num_tokens": 149739082.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "grad_norm": 1.5090917348861694, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8498616218566895, + "num_tokens": 149780034.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "grad_norm": 1.4295847415924072, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.856253981590271, + "num_tokens": 149819707.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "grad_norm": 1.5784786939620972, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.857359766960144, + "num_tokens": 149860802.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "grad_norm": 1.400362491607666, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8651763200759888, + "num_tokens": 149903554.0, + "step": 3927 + 
}, + { + "epoch": 0.49968197430352373, + "grad_norm": 1.4589447975158691, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8617261648178101, + "num_tokens": 149944313.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "grad_norm": 1.5799998044967651, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8698623180389404, + "num_tokens": 149978190.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "grad_norm": 1.606732726097107, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8468834161758423, + "num_tokens": 150017505.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "grad_norm": 1.5056298971176147, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8629639148712158, + "num_tokens": 150059192.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "grad_norm": 1.4936596155166626, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8606562614440918, + "num_tokens": 150099889.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "grad_norm": 1.5320669412612915, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8596222996711731, + "num_tokens": 150139251.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "grad_norm": 1.548704981803894, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8480894565582275, + "num_tokens": 150181983.0, + "step": 3934 + }, + { + "epoch": 0.5005724462536573, + "grad_norm": 1.4377975463867188, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8798530697822571, + "num_tokens": 150219948.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "grad_norm": 1.5404711961746216, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8737276196479797, + "num_tokens": 150255630.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "grad_norm": 1.6647449731826782, + "learning_rate": 1e-06, + "loss": 0.4338, + 
"mean_token_accuracy": 0.8474236726760864, + "num_tokens": 150289141.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "grad_norm": 1.6148611307144165, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8658204078674316, + "num_tokens": 150324818.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "grad_norm": 1.6502050161361694, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.855344831943512, + "num_tokens": 150358558.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "grad_norm": 1.4746406078338623, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8617775440216064, + "num_tokens": 150399808.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "grad_norm": 1.5505908727645874, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8540598750114441, + "num_tokens": 150438652.0, + "step": 3941 + }, + { + "epoch": 0.5014629182037909, + "grad_norm": 1.4746931791305542, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8624809980392456, + "num_tokens": 150482286.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "grad_norm": 1.615635871887207, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8411169052124023, + "num_tokens": 150524640.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "grad_norm": 1.5003048181533813, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8660328388214111, + "num_tokens": 150563920.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "grad_norm": 1.6038906574249268, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8423651456832886, + "num_tokens": 150602712.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "grad_norm": 1.4099916219711304, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8739572763442993, + "num_tokens": 150644190.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + 
"grad_norm": 1.6495288610458374, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8701881170272827, + "num_tokens": 150674369.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "grad_norm": 1.4326319694519043, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8650599718093872, + "num_tokens": 150715530.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "grad_norm": 1.6160774230957031, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8653870820999146, + "num_tokens": 150747896.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "grad_norm": 1.5033981800079346, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8577722311019897, + "num_tokens": 150786419.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "grad_norm": 1.5972121953964233, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8499997854232788, + "num_tokens": 150819350.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "grad_norm": 1.7648600339889526, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8539220690727234, + "num_tokens": 150849879.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "grad_norm": 1.515446424484253, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8646718263626099, + "num_tokens": 150890653.0, + "step": 3953 + }, + { + "epoch": 0.502989441546877, + "grad_norm": 1.5230644941329956, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8668022155761719, + "num_tokens": 150925708.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "grad_norm": 1.4605966806411743, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8646658658981323, + "num_tokens": 150967402.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "grad_norm": 1.5775787830352783, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8633779287338257, + 
"num_tokens": 151005814.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "grad_norm": 1.38939368724823, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8794856071472168, + "num_tokens": 151050097.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "grad_norm": 1.6689324378967285, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8698381185531616, + "num_tokens": 151079239.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "grad_norm": 1.4630013704299927, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8517038822174072, + "num_tokens": 151119588.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "grad_norm": 1.657874345779419, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8455038666725159, + "num_tokens": 151154382.0, + "step": 3960 + }, + { + "epoch": 0.5038799134970106, + "grad_norm": 1.4422638416290283, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8597997426986694, + "num_tokens": 151196461.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "grad_norm": 1.5938581228256226, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8703272342681885, + "num_tokens": 151234058.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "grad_norm": 1.5430806875228882, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8557227253913879, + "num_tokens": 151270278.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "grad_norm": 1.4813686609268188, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8577393293380737, + "num_tokens": 151310966.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "grad_norm": 1.6173399686813354, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8524736762046814, + "num_tokens": 151349864.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "grad_norm": 1.4830564260482788, + 
"learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8582088947296143, + "num_tokens": 151389465.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "grad_norm": 1.365379810333252, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8836095929145813, + "num_tokens": 151430492.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "grad_norm": 1.3952664136886597, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8625212907791138, + "num_tokens": 151473635.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "grad_norm": 1.4881128072738647, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8687869310379028, + "num_tokens": 151509330.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "grad_norm": 1.7213726043701172, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8632133603096008, + "num_tokens": 151541480.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "grad_norm": 1.618129014968872, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8717420697212219, + "num_tokens": 151576787.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "grad_norm": 1.4506686925888062, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8815144300460815, + "num_tokens": 151616753.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "grad_norm": 1.6566057205200195, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8457135558128357, + "num_tokens": 151655313.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "grad_norm": 1.5843491554260254, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8494082689285278, + "num_tokens": 151692992.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "grad_norm": 1.4461729526519775, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8666455149650574, + "num_tokens": 151732305.0, + "step": 3975 
+ }, + { + "epoch": 0.5057880676758683, + "grad_norm": 1.6711822748184204, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8623970746994019, + "num_tokens": 151768167.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "grad_norm": 1.642492651939392, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8613418340682983, + "num_tokens": 151802016.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "grad_norm": 1.468788504600525, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8705790042877197, + "num_tokens": 151841015.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "grad_norm": 1.453784704208374, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8495023250579834, + "num_tokens": 151884185.0, + "step": 3979 + }, + { + "epoch": 0.5062969087902303, + "grad_norm": 1.541831374168396, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8528659343719482, + "num_tokens": 151923127.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "grad_norm": 1.5509400367736816, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8588414788246155, + "num_tokens": 151963891.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "grad_norm": 1.601253628730774, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8504965901374817, + "num_tokens": 152001691.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "grad_norm": 1.472112774848938, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8718534708023071, + "num_tokens": 152041117.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "grad_norm": 1.470192790031433, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8628879189491272, + "num_tokens": 152081064.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "grad_norm": 1.4243875741958618, + "learning_rate": 1e-06, + "loss": 0.4059, + 
"mean_token_accuracy": 0.8619338274002075, + "num_tokens": 152122378.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "grad_norm": 1.4555585384368896, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8679949045181274, + "num_tokens": 152161017.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "grad_norm": 1.523247241973877, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8635587692260742, + "num_tokens": 152201073.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "grad_norm": 1.492205262184143, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8491233587265015, + "num_tokens": 152243140.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "grad_norm": 1.47824227809906, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8662798404693604, + "num_tokens": 152282848.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "grad_norm": 1.481414556503296, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.853577733039856, + "num_tokens": 152323676.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "grad_norm": 1.4628498554229736, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8664226531982422, + "num_tokens": 152359692.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "grad_norm": 1.5267632007598877, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8715236783027649, + "num_tokens": 152394727.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "grad_norm": 1.5141929388046265, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8759323954582214, + "num_tokens": 152430888.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "grad_norm": 1.3628133535385132, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8592362403869629, + "num_tokens": 152476688.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + 
"grad_norm": 1.535911202430725, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.874310314655304, + "num_tokens": 152514415.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "grad_norm": 1.439881682395935, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8725765943527222, + "num_tokens": 152552255.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "grad_norm": 1.5712538957595825, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8571281433105469, + "num_tokens": 152590632.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "grad_norm": 1.5783729553222656, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8679683804512024, + "num_tokens": 152622945.0, + "step": 3998 + }, + { + "epoch": 0.50871390408345, + "grad_norm": 1.5313844680786133, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8575603365898132, + "num_tokens": 152660793.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "grad_norm": 1.5810500383377075, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8747979402542114, + "num_tokens": 152693929.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "grad_norm": 1.5926501750946045, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8690221309661865, + "num_tokens": 152729492.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "grad_norm": 1.5712300539016724, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8652237057685852, + "num_tokens": 152765898.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "grad_norm": 1.494280457496643, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8487315773963928, + "num_tokens": 152805059.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "grad_norm": 1.492396354675293, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.864492654800415, + "num_tokens": 
152843696.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "grad_norm": 1.4520223140716553, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8609758615493774, + "num_tokens": 152884986.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "grad_norm": 1.4129537343978882, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8595378398895264, + "num_tokens": 152925945.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "grad_norm": 1.4467089176177979, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8522213101387024, + "num_tokens": 152972120.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "grad_norm": 1.4332207441329956, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8547180891036987, + "num_tokens": 153012507.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "grad_norm": 1.4604597091674805, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8687404990196228, + "num_tokens": 153051643.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "grad_norm": 1.5021374225616455, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8674010038375854, + "num_tokens": 153089150.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "grad_norm": 1.4681353569030762, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8646442890167236, + "num_tokens": 153130048.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "grad_norm": 1.6690090894699097, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8666208386421204, + "num_tokens": 153167344.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "grad_norm": 1.6320247650146484, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8441861271858215, + "num_tokens": 153206055.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "grad_norm": 1.7182657718658447, + "learning_rate": 1e-06, 
+ "loss": 0.3776, + "mean_token_accuracy": 0.8704967498779297, + "num_tokens": 153240573.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "grad_norm": 1.561936855316162, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.859329104423523, + "num_tokens": 153281041.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "grad_norm": 1.5622633695602417, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8688160181045532, + "num_tokens": 153319615.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "grad_norm": 1.5093799829483032, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8611631393432617, + "num_tokens": 153355570.0, + "step": 4017 + }, + { + "epoch": 0.5111308993766697, + "grad_norm": 1.493735432624817, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8711774945259094, + "num_tokens": 153396242.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "grad_norm": 1.6823982000350952, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8534291386604309, + "num_tokens": 153432476.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "grad_norm": 1.6420639753341675, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8781159520149231, + "num_tokens": 153466196.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "grad_norm": 1.4688087701797485, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8621050119400024, + "num_tokens": 153507406.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "grad_norm": 1.5143333673477173, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8658982515335083, + "num_tokens": 153542931.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "grad_norm": 1.5126687288284302, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8473973274230957, + "num_tokens": 153581904.0, + "step": 4023 + }, + { + "epoch": 
0.5118941610482127, + "grad_norm": 1.5498158931732178, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8568938970565796, + "num_tokens": 153623607.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "grad_norm": 1.5791751146316528, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8665341138839722, + "num_tokens": 153660686.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "grad_norm": 1.6803478002548218, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8743278384208679, + "num_tokens": 153692194.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "grad_norm": 1.454209804534912, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8635783195495605, + "num_tokens": 153736962.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "grad_norm": 1.4000657796859741, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8846660256385803, + "num_tokens": 153773802.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "grad_norm": 1.538123369216919, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8658628463745117, + "num_tokens": 153812645.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "grad_norm": 1.706857681274414, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8658851385116577, + "num_tokens": 153847934.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "grad_norm": 1.4851797819137573, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.862824559211731, + "num_tokens": 153887770.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "grad_norm": 1.6658165454864502, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8702232837677002, + "num_tokens": 153920330.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "grad_norm": 1.5454061031341553, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 
0.871842086315155, + "num_tokens": 153957552.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "grad_norm": 1.5994378328323364, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8712241649627686, + "num_tokens": 153990602.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "grad_norm": 1.502231240272522, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8781127333641052, + "num_tokens": 154029048.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "grad_norm": 1.560495376586914, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8723075985908508, + "num_tokens": 154064304.0, + "step": 4036 + }, + { + "epoch": 0.5135478946698894, + "grad_norm": 1.4187126159667969, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8677178621292114, + "num_tokens": 154106304.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "grad_norm": 1.3017090559005737, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8684155941009521, + "num_tokens": 154152333.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "grad_norm": 1.489551067352295, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8537352085113525, + "num_tokens": 154195278.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "grad_norm": 1.537810206413269, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.852267861366272, + "num_tokens": 154239295.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "grad_norm": 1.571498990058899, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8719996213912964, + "num_tokens": 154277675.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "grad_norm": 1.6087195873260498, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8567702770233154, + "num_tokens": 154314124.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "grad_norm": 
1.5826163291931152, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.849387526512146, + "num_tokens": 154350857.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "grad_norm": 1.6190438270568848, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8712788224220276, + "num_tokens": 154387736.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "grad_norm": 1.5983856916427612, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8579460382461548, + "num_tokens": 154421669.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "grad_norm": 1.4963960647583008, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.870958685874939, + "num_tokens": 154459163.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "grad_norm": 1.6076995134353638, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8516345024108887, + "num_tokens": 154496810.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "grad_norm": 1.5664355754852295, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8532339334487915, + "num_tokens": 154533856.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "grad_norm": 1.6500539779663086, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.862750768661499, + "num_tokens": 154568705.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "grad_norm": 1.5940395593643188, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8600215911865234, + "num_tokens": 154603875.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "grad_norm": 1.5279202461242676, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8555943965911865, + "num_tokens": 154648481.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "grad_norm": 1.5108295679092407, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8734603524208069, + "num_tokens": 
154686960.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "grad_norm": 1.6617859601974487, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8773807883262634, + "num_tokens": 154721340.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "grad_norm": 1.7175654172897339, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8580137491226196, + "num_tokens": 154753554.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "grad_norm": 1.469401240348816, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8558007478713989, + "num_tokens": 154795159.0, + "step": 4055 + }, + { + "epoch": 0.5159648899631091, + "grad_norm": 1.3828434944152832, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8572446703910828, + "num_tokens": 154839905.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "grad_norm": 1.4905632734298706, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8700345158576965, + "num_tokens": 154878826.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "grad_norm": 1.4864083528518677, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8601779937744141, + "num_tokens": 154918968.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "grad_norm": 1.6172587871551514, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8573340773582458, + "num_tokens": 154955058.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "grad_norm": 1.468469262123108, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8760108351707458, + "num_tokens": 154993700.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "grad_norm": 1.7086801528930664, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.85332852602005, + "num_tokens": 155031445.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "grad_norm": 1.5219430923461914, + "learning_rate": 1e-06, + 
"loss": 0.3744, + "mean_token_accuracy": 0.8697285652160645, + "num_tokens": 155066551.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "grad_norm": 1.5401052236557007, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8667418956756592, + "num_tokens": 155106465.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "grad_norm": 1.5472970008850098, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8624284863471985, + "num_tokens": 155143543.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "grad_norm": 1.4517109394073486, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8632917404174805, + "num_tokens": 155185881.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "grad_norm": 1.5742321014404297, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8714637160301208, + "num_tokens": 155222147.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "grad_norm": 1.4779350757598877, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8659270405769348, + "num_tokens": 155262657.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "grad_norm": 1.6381181478500366, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8585443496704102, + "num_tokens": 155298813.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "grad_norm": 1.4218918085098267, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8775954246520996, + "num_tokens": 155339036.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "grad_norm": 1.5347572565078735, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8599725961685181, + "num_tokens": 155376544.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "grad_norm": 1.4317315816879272, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8707756400108337, + "num_tokens": 155414539.0, + "step": 4071 + }, + { + "epoch": 
0.5180002544205572, + "grad_norm": 1.5653377771377563, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8584520816802979, + "num_tokens": 155449923.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "grad_norm": 1.5431712865829468, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8634032011032104, + "num_tokens": 155488623.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "grad_norm": 1.393488883972168, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8766610622406006, + "num_tokens": 155530893.0, + "step": 4074 + }, + { + "epoch": 0.5183818852563287, + "grad_norm": 1.5638681650161743, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.850581705570221, + "num_tokens": 155571637.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "grad_norm": 1.5940803289413452, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8530871868133545, + "num_tokens": 155606646.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "grad_norm": 1.4748072624206543, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8533669710159302, + "num_tokens": 155654106.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "grad_norm": 1.7204842567443848, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8576488494873047, + "num_tokens": 155688175.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "grad_norm": 1.61116623878479, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8602306842803955, + "num_tokens": 155722029.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "grad_norm": 1.3541674613952637, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8678697943687439, + "num_tokens": 155768641.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "grad_norm": 1.478312373161316, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 
0.8508912324905396, + "num_tokens": 155809459.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "grad_norm": 1.5117332935333252, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8622447848320007, + "num_tokens": 155848366.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "grad_norm": 1.4778621196746826, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.840186357498169, + "num_tokens": 155889777.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "grad_norm": 1.46523118019104, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8589028120040894, + "num_tokens": 155934309.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "grad_norm": 1.4533685445785522, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8693419694900513, + "num_tokens": 155973267.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "grad_norm": 1.6053481101989746, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8645061254501343, + "num_tokens": 156006390.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "grad_norm": 1.6093913316726685, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8552931547164917, + "num_tokens": 156045820.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "grad_norm": 1.474432349205017, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8674767017364502, + "num_tokens": 156085269.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "grad_norm": 1.470033884048462, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8577306270599365, + "num_tokens": 156127891.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "grad_norm": 1.6325222253799438, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8680453300476074, + "num_tokens": 156161943.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "grad_norm": 
1.4025005102157593, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8829867839813232, + "num_tokens": 156199529.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "grad_norm": 1.4746878147125244, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8658597469329834, + "num_tokens": 156242685.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "grad_norm": 1.7041428089141846, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8462188839912415, + "num_tokens": 156273419.0, + "step": 4093 + }, + { + "epoch": 0.5207988805495484, + "grad_norm": 1.6494340896606445, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8531799912452698, + "num_tokens": 156311965.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "grad_norm": 1.6026874780654907, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8552374839782715, + "num_tokens": 156351326.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "grad_norm": 1.4869085550308228, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8621120452880859, + "num_tokens": 156387379.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "grad_norm": 1.6661008596420288, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8638441562652588, + "num_tokens": 156419181.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "grad_norm": 1.512581706047058, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8704620003700256, + "num_tokens": 156456665.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "grad_norm": 1.5524765253067017, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8634313941001892, + "num_tokens": 156489560.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "grad_norm": 1.6172839403152466, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8727784156799316, + "num_tokens": 
156521707.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "grad_norm": 1.6456550359725952, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.843975305557251, + "num_tokens": 156557992.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "grad_norm": 1.609566569328308, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8706162571907043, + "num_tokens": 156597664.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "grad_norm": 1.610033631324768, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8621690273284912, + "num_tokens": 156641152.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "grad_norm": 1.6653072834014893, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8735508918762207, + "num_tokens": 156674286.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "grad_norm": 1.4189372062683105, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8544243574142456, + "num_tokens": 156718495.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "grad_norm": 1.5202313661575317, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.876347005367279, + "num_tokens": 156753129.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "grad_norm": 1.690784215927124, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8761975765228271, + "num_tokens": 156784897.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "grad_norm": 1.5368337631225586, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8512467741966248, + "num_tokens": 156826709.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "grad_norm": 1.5922642946243286, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8588380217552185, + "num_tokens": 156864301.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "grad_norm": 1.521016001701355, + "learning_rate": 1e-06, + "loss": 
0.3741, + "mean_token_accuracy": 0.8704986572265625, + "num_tokens": 156901217.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "grad_norm": 1.467199683189392, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8407076597213745, + "num_tokens": 156945359.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "grad_norm": 1.4579486846923828, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.869080662727356, + "num_tokens": 156982857.0, + "step": 4112 + }, + { + "epoch": 0.5232158758427681, + "grad_norm": 1.5485342741012573, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8613108992576599, + "num_tokens": 157020179.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "grad_norm": 1.605490803718567, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8618582487106323, + "num_tokens": 157055762.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "grad_norm": 1.540221095085144, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8812475204467773, + "num_tokens": 157089424.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "grad_norm": 1.4222326278686523, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.851719081401825, + "num_tokens": 157136009.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "grad_norm": 1.5318949222564697, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.85625159740448, + "num_tokens": 157174147.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "grad_norm": 1.4451605081558228, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8700184226036072, + "num_tokens": 157215474.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "grad_norm": 1.5536534786224365, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8790668249130249, + "num_tokens": 157256390.0, + "step": 4119 + }, + { + "epoch": 
0.5241063477929017, + "grad_norm": 1.5557522773742676, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8697239756584167, + "num_tokens": 157293221.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "grad_norm": 1.3914735317230225, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8652483224868774, + "num_tokens": 157335277.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "grad_norm": 1.5609400272369385, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8622486591339111, + "num_tokens": 157372348.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "grad_norm": 1.5448580980300903, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8778811693191528, + "num_tokens": 157406811.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "grad_norm": 1.401476263999939, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8788821697235107, + "num_tokens": 157447239.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "grad_norm": 1.4351730346679688, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8487132787704468, + "num_tokens": 157493486.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "grad_norm": 1.4651377201080322, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8675133585929871, + "num_tokens": 157529566.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "grad_norm": 1.5449298620224, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8689831495285034, + "num_tokens": 157563485.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "grad_norm": 1.5440772771835327, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8600601553916931, + "num_tokens": 157603315.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "grad_norm": 1.3741356134414673, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 
0.877895712852478, + "num_tokens": 157644230.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "grad_norm": 1.5215425491333008, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8594756722450256, + "num_tokens": 157682013.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "grad_norm": 1.757733941078186, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8626737594604492, + "num_tokens": 157714190.0, + "step": 4131 + }, + { + "epoch": 0.5256328711359878, + "grad_norm": 1.5917123556137085, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8827099204063416, + "num_tokens": 157745738.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "grad_norm": 1.4842756986618042, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.869051456451416, + "num_tokens": 157784519.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "grad_norm": 1.6222203969955444, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8649339079856873, + "num_tokens": 157816182.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "grad_norm": 1.414272427558899, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8733419179916382, + "num_tokens": 157854157.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "grad_norm": 1.5183300971984863, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8666015267372131, + "num_tokens": 157889547.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "grad_norm": 1.4711073637008667, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8680378198623657, + "num_tokens": 157929564.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "grad_norm": 1.5201841592788696, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8707420825958252, + "num_tokens": 157968660.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "grad_norm": 
1.4770561456680298, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8690952062606812, + "num_tokens": 158004489.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "grad_norm": 1.468531847000122, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8686316013336182, + "num_tokens": 158041857.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "grad_norm": 1.5152034759521484, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8705556988716125, + "num_tokens": 158079713.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "grad_norm": 1.4812997579574585, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8625091910362244, + "num_tokens": 158120352.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "grad_norm": 1.3792585134506226, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8638309836387634, + "num_tokens": 158161236.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "grad_norm": 1.4062583446502686, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8644689321517944, + "num_tokens": 158204933.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "grad_norm": 1.539483904838562, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8528703451156616, + "num_tokens": 158242922.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "grad_norm": 1.611693024635315, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8596802353858948, + "num_tokens": 158277346.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "grad_norm": 1.526247262954712, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.859465479850769, + "num_tokens": 158316091.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "grad_norm": 1.5198163986206055, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8647040128707886, + "num_tokens": 
158353615.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "grad_norm": 1.5612977743148804, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8577804565429688, + "num_tokens": 158394005.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "grad_norm": 1.5143728256225586, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.870150089263916, + "num_tokens": 158430645.0, + "step": 4150 + }, + { + "epoch": 0.5280498664292075, + "grad_norm": 1.5059280395507812, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8579782247543335, + "num_tokens": 158470191.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "grad_norm": 1.4908803701400757, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8498032093048096, + "num_tokens": 158515054.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "grad_norm": 1.5702595710754395, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8714513778686523, + "num_tokens": 158549904.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "grad_norm": 1.6523464918136597, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8629770874977112, + "num_tokens": 158579227.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "grad_norm": 1.501538634300232, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8612666130065918, + "num_tokens": 158619179.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "grad_norm": 1.5072048902511597, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8621340990066528, + "num_tokens": 158658801.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "grad_norm": 1.494990587234497, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8712717890739441, + "num_tokens": 158696988.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "grad_norm": 1.4290568828582764, + "learning_rate": 1e-06, + 
"loss": 0.4173, + "mean_token_accuracy": 0.859357476234436, + "num_tokens": 158739740.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "grad_norm": 1.3386105298995972, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8764557242393494, + "num_tokens": 158782488.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "grad_norm": 1.5638127326965332, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8687083125114441, + "num_tokens": 158815970.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "grad_norm": 1.5898290872573853, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8375855684280396, + "num_tokens": 158855143.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "grad_norm": 1.577345371246338, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8677235841751099, + "num_tokens": 158889322.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "grad_norm": 1.5624185800552368, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8667995929718018, + "num_tokens": 158925165.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "grad_norm": 1.3997207880020142, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8689389228820801, + "num_tokens": 158965475.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "grad_norm": 1.548849105834961, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8583389520645142, + "num_tokens": 159002662.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "grad_norm": 1.6540464162826538, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8626196980476379, + "num_tokens": 159036180.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "grad_norm": 1.6238739490509033, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8592011332511902, + "num_tokens": 159070661.0, + "step": 4167 + }, + { + "epoch": 
0.5302124411652461, + "grad_norm": 1.5100839138031006, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8828179240226746, + "num_tokens": 159107348.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "grad_norm": 1.3962286710739136, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8734238147735596, + "num_tokens": 159148028.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "grad_norm": 1.7197377681732178, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.870309054851532, + "num_tokens": 159185831.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "grad_norm": 1.4878945350646973, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8601436614990234, + "num_tokens": 159226096.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "grad_norm": 1.5032365322113037, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8763816356658936, + "num_tokens": 159262687.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "grad_norm": 1.6939729452133179, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8620116710662842, + "num_tokens": 159296956.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "grad_norm": 1.4626718759536743, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8819859027862549, + "num_tokens": 159333320.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "grad_norm": 1.5323668718338013, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8826394081115723, + "num_tokens": 159367477.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "grad_norm": 1.4526808261871338, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8675619959831238, + "num_tokens": 159404901.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "grad_norm": 1.593717098236084, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 
0.8614601492881775, + "num_tokens": 159441137.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "grad_norm": 1.5661194324493408, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8598796725273132, + "num_tokens": 159479921.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "grad_norm": 1.5925532579421997, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8567951321601868, + "num_tokens": 159522052.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "grad_norm": 1.5903737545013428, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.875322163105011, + "num_tokens": 159557134.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "grad_norm": 1.732008934020996, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8476815223693848, + "num_tokens": 159589549.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "grad_norm": 1.5448464155197144, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8629539012908936, + "num_tokens": 159625707.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "grad_norm": 1.5491901636123657, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8723909854888916, + "num_tokens": 159659935.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "grad_norm": 1.386248230934143, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8694647550582886, + "num_tokens": 159700985.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "grad_norm": 1.5020076036453247, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8534862399101257, + "num_tokens": 159737102.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "grad_norm": 1.4066628217697144, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8647257089614868, + "num_tokens": 159780154.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "grad_norm": 
1.4565539360046387, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8521122932434082, + "num_tokens": 159822087.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "grad_norm": 1.4420411586761475, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.875243604183197, + "num_tokens": 159860421.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "grad_norm": 1.4086365699768066, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.870406985282898, + "num_tokens": 159906207.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "grad_norm": 1.4715213775634766, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8731005191802979, + "num_tokens": 159942611.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "grad_norm": 1.5800482034683228, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8559975624084473, + "num_tokens": 159979988.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "grad_norm": 1.4875562191009521, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8799493312835693, + "num_tokens": 160016156.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "grad_norm": 1.37191903591156, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8726376891136169, + "num_tokens": 160057045.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "grad_norm": 1.5891644954681396, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8696518540382385, + "num_tokens": 160089063.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "grad_norm": 1.561639428138733, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8783379793167114, + "num_tokens": 160122009.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "grad_norm": 1.5569169521331787, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8690856695175171, + "num_tokens": 
160159606.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "grad_norm": 1.4988199472427368, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8650014996528625, + "num_tokens": 160197969.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "grad_norm": 1.5810672044754028, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8514161109924316, + "num_tokens": 160237420.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "grad_norm": 1.5147697925567627, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8521663546562195, + "num_tokens": 160276065.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "grad_norm": 1.5640281438827515, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8550519943237305, + "num_tokens": 160313849.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "grad_norm": 1.38382089138031, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8594324588775635, + "num_tokens": 160357299.0, + "step": 4201 + }, + { + "epoch": 0.5345375906373235, + "grad_norm": 1.5253639221191406, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8572733402252197, + "num_tokens": 160397409.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "grad_norm": 1.580915927886963, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8730703592300415, + "num_tokens": 160430955.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "grad_norm": 1.5628039836883545, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8672873973846436, + "num_tokens": 160465614.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "grad_norm": 1.4271268844604492, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8661950826644897, + "num_tokens": 160505386.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "grad_norm": 1.4753077030181885, + "learning_rate": 1e-06, + 
"loss": 0.4298, + "mean_token_accuracy": 0.8584535121917725, + "num_tokens": 160547834.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "grad_norm": 1.6538349390029907, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8666612505912781, + "num_tokens": 160587928.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "grad_norm": 1.5565552711486816, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8707647323608398, + "num_tokens": 160621263.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "grad_norm": 1.6463459730148315, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8564260005950928, + "num_tokens": 160660105.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "grad_norm": 1.5757477283477783, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8540738224983215, + "num_tokens": 160698946.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "grad_norm": 1.5577342510223389, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8717384934425354, + "num_tokens": 160738112.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "grad_norm": 1.4856325387954712, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.857474684715271, + "num_tokens": 160780907.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "grad_norm": 1.5728789567947388, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8411996364593506, + "num_tokens": 160824328.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "grad_norm": 1.5504506826400757, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8607069849967957, + "num_tokens": 160864793.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "grad_norm": 1.487722635269165, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8653232455253601, + "num_tokens": 160900482.0, + "step": 4215 + }, + { + "epoch": 
0.5363185345375906, + "grad_norm": 1.566177487373352, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8545125722885132, + "num_tokens": 160938471.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "grad_norm": 1.628143072128296, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8572083115577698, + "num_tokens": 160978738.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "grad_norm": 1.6575599908828735, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8548558354377747, + "num_tokens": 161014609.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "grad_norm": 1.4641658067703247, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8701871037483215, + "num_tokens": 161050590.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "grad_norm": 1.5527814626693726, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8491814136505127, + "num_tokens": 161089011.0, + "step": 4220 + }, + { + "epoch": 0.5369545859305432, + "grad_norm": 1.6825038194656372, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8523741960525513, + "num_tokens": 161122317.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "grad_norm": 1.515380859375, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8546622395515442, + "num_tokens": 161161484.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "grad_norm": 1.570523738861084, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8505005240440369, + "num_tokens": 161202268.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "grad_norm": 1.5940629243850708, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8409495949745178, + "num_tokens": 161239784.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "grad_norm": 1.5225030183792114, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 
0.8620085716247559, + "num_tokens": 161280108.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "grad_norm": 1.4419504404067993, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8562314510345459, + "num_tokens": 161321977.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "grad_norm": 1.418810248374939, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8736112117767334, + "num_tokens": 161358768.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "grad_norm": 1.6401299238204956, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8531307578086853, + "num_tokens": 161395298.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "grad_norm": 1.4677869081497192, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8730055689811707, + "num_tokens": 161435961.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "grad_norm": 1.6580380201339722, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.852668046951294, + "num_tokens": 161472085.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "grad_norm": 1.4845081567764282, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8704087734222412, + "num_tokens": 161508380.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "grad_norm": 1.5492392778396606, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8754563331604004, + "num_tokens": 161544589.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "grad_norm": 1.5584248304367065, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8608721494674683, + "num_tokens": 161581126.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "grad_norm": 1.5587267875671387, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.842376708984375, + "num_tokens": 161615684.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "grad_norm": 
1.583709478378296, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8508301973342896, + "num_tokens": 161656449.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "grad_norm": 1.408191204071045, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8659015893936157, + "num_tokens": 161697597.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "grad_norm": 1.5024809837341309, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8556381464004517, + "num_tokens": 161738838.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "grad_norm": 1.5593703985214233, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8536003828048706, + "num_tokens": 161773987.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "grad_norm": 1.4940526485443115, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8574079871177673, + "num_tokens": 161812604.0, + "step": 4239 + }, + { + "epoch": 0.5393715812237628, + "grad_norm": 1.4950486421585083, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8629748821258545, + "num_tokens": 161850641.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "grad_norm": 1.410569667816162, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8601974248886108, + "num_tokens": 161891510.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "grad_norm": 1.420222282409668, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8725569248199463, + "num_tokens": 161928370.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "grad_norm": 1.5237176418304443, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8452330827713013, + "num_tokens": 161965900.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "grad_norm": 1.3770824670791626, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8776071071624756, + "num_tokens": 
162007438.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "grad_norm": 1.4763221740722656, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8502897024154663, + "num_tokens": 162049094.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "grad_norm": 1.5409166812896729, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.870233952999115, + "num_tokens": 162085177.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "grad_norm": 1.4997308254241943, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8702313899993896, + "num_tokens": 162121228.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "grad_norm": 1.4808988571166992, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8751894235610962, + "num_tokens": 162159736.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "grad_norm": 1.411085605621338, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8683667182922363, + "num_tokens": 162203081.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "grad_norm": 1.4793243408203125, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8693392276763916, + "num_tokens": 162242272.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "grad_norm": 1.5755952596664429, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8714737296104431, + "num_tokens": 162276353.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "grad_norm": 1.5530285835266113, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8648946285247803, + "num_tokens": 162313017.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "grad_norm": 1.3888524770736694, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8611325025558472, + "num_tokens": 162359809.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "grad_norm": 1.4942741394042969, + "learning_rate": 1e-06, + 
"loss": 0.4112, + "mean_token_accuracy": 0.8588349223136902, + "num_tokens": 162400367.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "grad_norm": 1.501695990562439, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8600966930389404, + "num_tokens": 162442009.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "grad_norm": 1.6887937784194946, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.86528080701828, + "num_tokens": 162478889.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "grad_norm": 1.5262037515640259, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8573452234268188, + "num_tokens": 162520758.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "grad_norm": 1.5825791358947754, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8572885990142822, + "num_tokens": 162557533.0, + "step": 4258 + }, + { + "epoch": 0.5417885765169825, + "grad_norm": 1.3633145093917847, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8737305402755737, + "num_tokens": 162601033.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "grad_norm": 1.5176125764846802, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.864560604095459, + "num_tokens": 162640740.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "grad_norm": 1.7294909954071045, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8548482060432434, + "num_tokens": 162671266.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "grad_norm": 1.4662842750549316, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8656091094017029, + "num_tokens": 162708957.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "grad_norm": 1.41005277633667, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8663152456283569, + "num_tokens": 162749791.0, + "step": 4263 + }, + { + "epoch": 
0.5424246279099352, + "grad_norm": 1.5027499198913574, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.875430703163147, + "num_tokens": 162782121.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "grad_norm": 1.5905725955963135, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8631407618522644, + "num_tokens": 162821392.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "grad_norm": 1.659936547279358, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8553023338317871, + "num_tokens": 162853436.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "grad_norm": 1.6730321645736694, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8705620169639587, + "num_tokens": 162884083.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "grad_norm": 1.4650198221206665, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8697618246078491, + "num_tokens": 162919840.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "grad_norm": 1.532357931137085, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8655612468719482, + "num_tokens": 162960299.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "grad_norm": 1.4695513248443604, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8599722385406494, + "num_tokens": 163003606.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "grad_norm": 1.5397799015045166, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8641411662101746, + "num_tokens": 163041326.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "grad_norm": 1.4197144508361816, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8669383525848389, + "num_tokens": 163085375.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "grad_norm": 1.426814317703247, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 
0.868922233581543, + "num_tokens": 163124974.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "grad_norm": 1.5056805610656738, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8699036836624146, + "num_tokens": 163159109.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "grad_norm": 1.612898349761963, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8537718057632446, + "num_tokens": 163194729.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "grad_norm": 1.6465553045272827, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8634427189826965, + "num_tokens": 163229372.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "grad_norm": 1.4575910568237305, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8752880096435547, + "num_tokens": 163265798.0, + "step": 4277 + }, + { + "epoch": 0.5442055718102022, + "grad_norm": 1.5101158618927002, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8539265990257263, + "num_tokens": 163303851.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "grad_norm": 1.4682537317276, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8633407950401306, + "num_tokens": 163341340.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "grad_norm": 1.5170599222183228, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8641672134399414, + "num_tokens": 163380165.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "grad_norm": 1.628303050994873, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8507283329963684, + "num_tokens": 163419041.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "grad_norm": 1.4966716766357422, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.860476016998291, + "num_tokens": 163459604.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "grad_norm": 
1.5641918182373047, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8691840767860413, + "num_tokens": 163495297.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "grad_norm": 1.509351372718811, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8718234896659851, + "num_tokens": 163528156.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "grad_norm": 1.5576410293579102, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8693112134933472, + "num_tokens": 163563231.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "grad_norm": 1.4145644903182983, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8700359463691711, + "num_tokens": 163601666.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "grad_norm": 1.5782225131988525, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8664072751998901, + "num_tokens": 163634438.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "grad_norm": 1.4550780057907104, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8600195646286011, + "num_tokens": 163678405.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "grad_norm": 1.6220109462738037, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8596565127372742, + "num_tokens": 163712751.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "grad_norm": 1.7240041494369507, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8512317538261414, + "num_tokens": 163744458.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "grad_norm": 1.6080598831176758, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8608655333518982, + "num_tokens": 163777265.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "grad_norm": 1.5088074207305908, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8753154277801514, + "num_tokens": 
163816160.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "grad_norm": 1.5782268047332764, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8555803298950195, + "num_tokens": 163853038.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "grad_norm": 1.6148468255996704, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.844770073890686, + "num_tokens": 163894479.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "grad_norm": 1.5970209836959839, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8432915210723877, + "num_tokens": 163931441.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "grad_norm": 1.2915188074111938, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8787136673927307, + "num_tokens": 163979507.0, + "step": 4296 + }, + { + "epoch": 0.5466225671034219, + "grad_norm": 1.4778228998184204, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8639540076255798, + "num_tokens": 164022013.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "grad_norm": 1.4973646402359009, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8612349033355713, + "num_tokens": 164058227.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "grad_norm": 1.568886160850525, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8576853275299072, + "num_tokens": 164096576.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "grad_norm": 1.4500921964645386, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8691117167472839, + "num_tokens": 164137071.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "grad_norm": 1.373547911643982, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8630211353302002, + "num_tokens": 164185603.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "grad_norm": 1.5739728212356567, + "learning_rate": 1e-06, + 
"loss": 0.4443, + "mean_token_accuracy": 0.8505063056945801, + "num_tokens": 164226048.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "grad_norm": 1.474869728088379, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8587320446968079, + "num_tokens": 164270547.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "grad_norm": 1.51848566532135, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8666974902153015, + "num_tokens": 164306703.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "grad_norm": 1.4344732761383057, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8689272403717041, + "num_tokens": 164346569.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "grad_norm": 1.4942779541015625, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8475093841552734, + "num_tokens": 164385574.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "grad_norm": 1.43767249584198, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8759410381317139, + "num_tokens": 164426639.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "grad_norm": 1.5385926961898804, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8591634035110474, + "num_tokens": 164462904.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "grad_norm": 1.3471858501434326, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8591232299804688, + "num_tokens": 164505745.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "grad_norm": 1.6110403537750244, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8598026037216187, + "num_tokens": 164538598.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "grad_norm": 1.4881552457809448, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8590091466903687, + "num_tokens": 164579761.0, + "step": 4311 + }, + { + "epoch": 
0.5485307212822796, + "grad_norm": 1.675437092781067, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8625888228416443, + "num_tokens": 164611007.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "grad_norm": 1.5058116912841797, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8511485457420349, + "num_tokens": 164655587.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "grad_norm": 1.5101732015609741, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8636155128479004, + "num_tokens": 164693426.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "grad_norm": 1.583949089050293, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8617173433303833, + "num_tokens": 164725991.0, + "step": 4315 + }, + { + "epoch": 0.5490395623966416, + "grad_norm": 1.5441375970840454, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8610614538192749, + "num_tokens": 164760776.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "grad_norm": 1.5821605920791626, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8666088581085205, + "num_tokens": 164791712.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "grad_norm": 1.4228720664978027, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8565533757209778, + "num_tokens": 164832919.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "grad_norm": 1.3671441078186035, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8679170608520508, + "num_tokens": 164874472.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "grad_norm": 1.527186393737793, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8672416806221008, + "num_tokens": 164911136.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "grad_norm": 1.4946998357772827, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 
0.864228367805481, + "num_tokens": 164946603.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "grad_norm": 1.5629119873046875, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8572866916656494, + "num_tokens": 164981457.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "grad_norm": 1.5471941232681274, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8515918254852295, + "num_tokens": 165018380.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "grad_norm": 1.4858647584915161, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8676799535751343, + "num_tokens": 165056600.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "grad_norm": 1.6737160682678223, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8626053333282471, + "num_tokens": 165091177.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "grad_norm": 1.451540470123291, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8724976778030396, + "num_tokens": 165129446.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "grad_norm": 1.3407799005508423, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8731448650360107, + "num_tokens": 165174503.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "grad_norm": 1.5849536657333374, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8607993125915527, + "num_tokens": 165211621.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "grad_norm": 1.4115087985992432, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8714098334312439, + "num_tokens": 165253115.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "grad_norm": 1.6360543966293335, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8417240381240845, + "num_tokens": 165290245.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "grad_norm": 
1.4600260257720947, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8730930685997009, + "num_tokens": 165329788.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "grad_norm": 1.3908461332321167, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8773132562637329, + "num_tokens": 165371700.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "grad_norm": 1.5430364608764648, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.866239607334137, + "num_tokens": 165409752.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "grad_norm": 1.6435813903808594, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8397048115730286, + "num_tokens": 165447929.0, + "step": 4334 + }, + { + "epoch": 0.5514565576898613, + "grad_norm": 1.5089139938354492, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8611882925033569, + "num_tokens": 165487076.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "grad_norm": 1.5474129915237427, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8586371541023254, + "num_tokens": 165523895.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "grad_norm": 1.570467472076416, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.872225284576416, + "num_tokens": 165556770.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "grad_norm": 1.4717862606048584, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8657457828521729, + "num_tokens": 165598223.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "grad_norm": 1.5927852392196655, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8616882562637329, + "num_tokens": 165633731.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "grad_norm": 1.4560760259628296, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8795719146728516, + "num_tokens": 
165670935.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "grad_norm": 1.4032820463180542, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8733412623405457, + "num_tokens": 165709169.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "grad_norm": 1.452571988105774, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8649308085441589, + "num_tokens": 165750150.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "grad_norm": 1.6721765995025635, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8620195388793945, + "num_tokens": 165782670.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "grad_norm": 1.5837416648864746, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8445389270782471, + "num_tokens": 165822526.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "grad_norm": 1.6479709148406982, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8515315651893616, + "num_tokens": 165856324.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "grad_norm": 1.6084222793579102, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8626834154129028, + "num_tokens": 165890182.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "grad_norm": 1.739043116569519, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8805950880050659, + "num_tokens": 165923014.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "grad_norm": 1.4368540048599243, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8651068806648254, + "num_tokens": 165963561.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "grad_norm": 1.4300395250320435, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8753539323806763, + "num_tokens": 165998700.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "grad_norm": 1.5641226768493652, + "learning_rate": 1e-06, + 
"loss": 0.3753, + "mean_token_accuracy": 0.8725639581680298, + "num_tokens": 166033424.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "grad_norm": 1.6337759494781494, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.855536162853241, + "num_tokens": 166069755.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "grad_norm": 1.4695504903793335, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8758865594863892, + "num_tokens": 166108066.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "grad_norm": 1.5278862714767456, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8638486862182617, + "num_tokens": 166146207.0, + "step": 4353 + }, + { + "epoch": 0.553873552983081, + "grad_norm": 1.5597954988479614, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.865639328956604, + "num_tokens": 166180762.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "grad_norm": 1.5914719104766846, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8409954309463501, + "num_tokens": 166220609.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "grad_norm": 1.502058982849121, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8482692241668701, + "num_tokens": 166260454.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "grad_norm": 1.3895679712295532, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8707060217857361, + "num_tokens": 166300811.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "grad_norm": 1.594902515411377, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8538019061088562, + "num_tokens": 166337683.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "grad_norm": 1.489698052406311, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8756183385848999, + "num_tokens": 166373841.0, + "step": 4359 + }, + { + "epoch": 
0.5546368146546241, + "grad_norm": 1.502817153930664, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8694864511489868, + "num_tokens": 166410933.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "grad_norm": 1.6591092348098755, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8439847826957703, + "num_tokens": 166443188.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "grad_norm": 1.4594855308532715, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8624868988990784, + "num_tokens": 166486515.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "grad_norm": 1.4125194549560547, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.870667576789856, + "num_tokens": 166526664.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "grad_norm": 1.559057354927063, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8716442584991455, + "num_tokens": 166562544.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "grad_norm": 1.422399878501892, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8538315296173096, + "num_tokens": 166603463.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "grad_norm": 1.5538169145584106, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8455743789672852, + "num_tokens": 166645034.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "grad_norm": 1.7044241428375244, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8548371195793152, + "num_tokens": 166683715.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "grad_norm": 1.5241029262542725, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.866721510887146, + "num_tokens": 166720900.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "grad_norm": 1.3913490772247314, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 
0.8555554151535034, + "num_tokens": 166763194.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "grad_norm": 1.5034055709838867, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8708821535110474, + "num_tokens": 166802575.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "grad_norm": 1.7340940237045288, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8442081809043884, + "num_tokens": 166836914.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "grad_norm": 1.586929440498352, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8494729995727539, + "num_tokens": 166873353.0, + "step": 4372 + }, + { + "epoch": 0.5562905482763008, + "grad_norm": 1.5456109046936035, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8573440313339233, + "num_tokens": 166909462.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "grad_norm": 1.4535908699035645, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8745262622833252, + "num_tokens": 166946652.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "grad_norm": 1.59026300907135, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8565840721130371, + "num_tokens": 166982424.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "grad_norm": 1.776200532913208, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8615491390228271, + "num_tokens": 167017272.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "grad_norm": 1.4487398862838745, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8534095883369446, + "num_tokens": 167060499.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "grad_norm": 1.467980146408081, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8564721345901489, + "num_tokens": 167102277.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "grad_norm": 
1.5242602825164795, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8571242094039917, + "num_tokens": 167139161.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "grad_norm": 1.5024747848510742, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8713561296463013, + "num_tokens": 167174465.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "grad_norm": 1.5683363676071167, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.86185622215271, + "num_tokens": 167215603.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "grad_norm": 1.5779839754104614, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8650930523872375, + "num_tokens": 167250496.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "grad_norm": 1.5120527744293213, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8723018169403076, + "num_tokens": 167287851.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "grad_norm": 1.5150401592254639, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8581625819206238, + "num_tokens": 167327633.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "grad_norm": 1.4787166118621826, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8707029819488525, + "num_tokens": 167368574.0, + "step": 4385 + }, + { + "epoch": 0.5579442818979774, + "grad_norm": 1.4585931301116943, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8747357130050659, + "num_tokens": 167410520.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "grad_norm": 1.4571497440338135, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8700669407844543, + "num_tokens": 167446596.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "grad_norm": 1.4103572368621826, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.878113329410553, + "num_tokens": 
167488802.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "grad_norm": 1.5330227613449097, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8638367652893066, + "num_tokens": 167526714.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "grad_norm": 1.5173020362854004, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8779003024101257, + "num_tokens": 167564661.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "grad_norm": 1.482926607131958, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8546245098114014, + "num_tokens": 167607484.0, + "step": 4391 + }, + { + "epoch": 0.5587075435695205, + "grad_norm": 1.5784013271331787, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8599898219108582, + "num_tokens": 167645596.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "grad_norm": 1.4930717945098877, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.889243483543396, + "num_tokens": 167679773.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "grad_norm": 1.4833284616470337, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.879338264465332, + "num_tokens": 167717448.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "grad_norm": 1.5570385456085205, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8711654543876648, + "num_tokens": 167754938.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "grad_norm": 1.6032501459121704, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8618290424346924, + "num_tokens": 167791290.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "grad_norm": 1.4243282079696655, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8700721263885498, + "num_tokens": 167831240.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "grad_norm": 1.5363410711288452, + "learning_rate": 1e-06, + 
"loss": 0.433, + "mean_token_accuracy": 0.8588522672653198, + "num_tokens": 167871305.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "grad_norm": 1.6288334131240845, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8508452773094177, + "num_tokens": 167906508.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "grad_norm": 1.4993412494659424, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.859687328338623, + "num_tokens": 167943268.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "grad_norm": 1.484403133392334, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8487192392349243, + "num_tokens": 167985117.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "grad_norm": 1.585573673248291, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8438324928283691, + "num_tokens": 168020212.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "grad_norm": 1.456672191619873, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8619325160980225, + "num_tokens": 168060140.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "grad_norm": 1.6266238689422607, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8664228916168213, + "num_tokens": 168093644.0, + "step": 4404 + }, + { + "epoch": 0.560361277191197, + "grad_norm": 1.5876892805099487, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8556836843490601, + "num_tokens": 168130061.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "grad_norm": 1.5371705293655396, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8709677457809448, + "num_tokens": 168165160.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "grad_norm": 1.553317666053772, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8799853324890137, + "num_tokens": 168199427.0, + "step": 4407 + }, + { + "epoch": 
0.5607429080269686, + "grad_norm": 1.5420408248901367, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8761062026023865, + "num_tokens": 168231577.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "grad_norm": 1.542284369468689, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8609951734542847, + "num_tokens": 168266755.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "grad_norm": 1.5032234191894531, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8610638380050659, + "num_tokens": 168307338.0, + "step": 4410 + }, + { + "epoch": 0.5611245388627402, + "grad_norm": 1.4775030612945557, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.879627525806427, + "num_tokens": 168340805.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "grad_norm": 1.4617445468902588, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8580463528633118, + "num_tokens": 168378793.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "grad_norm": 1.459503412246704, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8610965013504028, + "num_tokens": 168420769.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "grad_norm": 1.514007568359375, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8572430610656738, + "num_tokens": 168458143.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "grad_norm": 1.5077043771743774, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8629814386367798, + "num_tokens": 168493162.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "grad_norm": 1.3879719972610474, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8666203022003174, + "num_tokens": 168537058.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "grad_norm": 1.506866216659546, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 
0.8691573143005371, + "num_tokens": 168575371.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "grad_norm": 1.476367712020874, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.877724826335907, + "num_tokens": 168614142.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "grad_norm": 1.5279793739318848, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8525869250297546, + "num_tokens": 168650024.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "grad_norm": 1.4996668100357056, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8621749877929688, + "num_tokens": 168691668.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "grad_norm": 1.6028188467025757, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8546673655509949, + "num_tokens": 168725845.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "grad_norm": 1.3947713375091553, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8592500686645508, + "num_tokens": 168767736.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "grad_norm": 1.55105721950531, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8539314270019531, + "num_tokens": 168806834.0, + "step": 4423 + }, + { + "epoch": 0.5627782724844167, + "grad_norm": 1.5946753025054932, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8634403347969055, + "num_tokens": 168841409.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "grad_norm": 1.4649858474731445, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8592886924743652, + "num_tokens": 168880419.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "grad_norm": 1.4963507652282715, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8537707328796387, + "num_tokens": 168918269.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "grad_norm": 
1.5230083465576172, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8601495027542114, + "num_tokens": 168956209.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "grad_norm": 1.492891788482666, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8619649410247803, + "num_tokens": 168991288.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "grad_norm": 1.451310157775879, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.874222993850708, + "num_tokens": 169028287.0, + "step": 4429 + }, + { + "epoch": 0.5635415341559598, + "grad_norm": 1.5155041217803955, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8737529516220093, + "num_tokens": 169064657.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "grad_norm": 1.4070510864257812, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8790822625160217, + "num_tokens": 169103721.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "grad_norm": 1.590370535850525, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8665316104888916, + "num_tokens": 169136175.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "grad_norm": 1.545852780342102, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8697521686553955, + "num_tokens": 169170110.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "grad_norm": 1.4244643449783325, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8733679056167603, + "num_tokens": 169213178.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "grad_norm": 1.409195065498352, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8722048997879028, + "num_tokens": 169253544.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "grad_norm": 1.526951551437378, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8628835082054138, + "num_tokens": 
169291552.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "grad_norm": 1.3116278648376465, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8892462253570557, + "num_tokens": 169332441.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "grad_norm": 1.5001415014266968, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8424657583236694, + "num_tokens": 169374900.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "grad_norm": 1.450577974319458, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8723240494728088, + "num_tokens": 169414905.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "grad_norm": 1.4687135219573975, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8690697550773621, + "num_tokens": 169453846.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "grad_norm": 1.4953784942626953, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8828420639038086, + "num_tokens": 169489925.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "grad_norm": 1.600831151008606, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8529518842697144, + "num_tokens": 169527675.0, + "step": 4442 + }, + { + "epoch": 0.5651952677776364, + "grad_norm": 1.5544580221176147, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8640074729919434, + "num_tokens": 169563221.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "grad_norm": 1.6785976886749268, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8502727746963501, + "num_tokens": 169600038.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "grad_norm": 1.5967662334442139, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.858522891998291, + "num_tokens": 169637513.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "grad_norm": 1.6391277313232422, + "learning_rate": 1e-06, + 
"loss": 0.443, + "mean_token_accuracy": 0.850376307964325, + "num_tokens": 169672375.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "grad_norm": 1.4661426544189453, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8623175621032715, + "num_tokens": 169716801.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "grad_norm": 1.6271979808807373, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8637477159500122, + "num_tokens": 169751058.0, + "step": 4448 + }, + { + "epoch": 0.5659585294491795, + "grad_norm": 1.3885034322738647, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.872989296913147, + "num_tokens": 169794841.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "grad_norm": 1.51858389377594, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8620919585227966, + "num_tokens": 169837952.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "grad_norm": 1.4287278652191162, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8562391996383667, + "num_tokens": 169879681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "grad_norm": 1.5327447652816772, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8657444715499878, + "num_tokens": 169915101.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "grad_norm": 1.627061128616333, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8652816414833069, + "num_tokens": 169949162.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "grad_norm": 1.4856336116790771, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8541631102561951, + "num_tokens": 169990361.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "grad_norm": 1.3535821437835693, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8775359392166138, + "num_tokens": 170031872.0, + "step": 4455 + }, + { + "epoch": 
0.566849001399313, + "grad_norm": 1.5842719078063965, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8491865396499634, + "num_tokens": 170067405.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "grad_norm": 1.409374475479126, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8676512241363525, + "num_tokens": 170109888.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "grad_norm": 1.648485779762268, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8504256010055542, + "num_tokens": 170145085.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "grad_norm": 1.5007550716400146, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8672928810119629, + "num_tokens": 170182497.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "grad_norm": 1.7047410011291504, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8474463224411011, + "num_tokens": 170217204.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "grad_norm": 2.0876009464263916, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8590081930160522, + "num_tokens": 170250927.0, + "step": 4461 + }, + { + "epoch": 0.5676122630708561, + "grad_norm": 1.982175350189209, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8658761978149414, + "num_tokens": 170285530.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "grad_norm": 1.5637059211730957, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8406274914741516, + "num_tokens": 170324231.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "grad_norm": 1.5631657838821411, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8517554998397827, + "num_tokens": 170360799.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "grad_norm": 1.4462928771972656, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 
0.8631897568702698, + "num_tokens": 170402197.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "grad_norm": 1.7038379907608032, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8636025190353394, + "num_tokens": 170435887.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "grad_norm": 1.5188924074172974, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8645524382591248, + "num_tokens": 170470331.0, + "step": 4467 + }, + { + "epoch": 0.5683755247423992, + "grad_norm": 1.5713565349578857, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8746743202209473, + "num_tokens": 170506809.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "grad_norm": 1.521126627922058, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8755381107330322, + "num_tokens": 170541499.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "grad_norm": 1.43986976146698, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8578876256942749, + "num_tokens": 170583685.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "grad_norm": 1.6356751918792725, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8551249504089355, + "num_tokens": 170617649.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "grad_norm": 1.6054433584213257, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8660076856613159, + "num_tokens": 170649728.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "grad_norm": 1.4677386283874512, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.876309871673584, + "num_tokens": 170689346.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "grad_norm": 1.5473148822784424, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8731988668441772, + "num_tokens": 170725009.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "grad_norm": 
1.5331248044967651, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8623380661010742, + "num_tokens": 170760707.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "grad_norm": 1.6349796056747437, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8672228455543518, + "num_tokens": 170793967.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "grad_norm": 1.5121396780014038, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8618925213813782, + "num_tokens": 170829883.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "grad_norm": 1.4670016765594482, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8766090869903564, + "num_tokens": 170867922.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "grad_norm": 1.3263579607009888, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8661494255065918, + "num_tokens": 170915043.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "grad_norm": 1.5996949672698975, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.859419584274292, + "num_tokens": 170952122.0, + "step": 4480 + }, + { + "epoch": 0.5700292583640758, + "grad_norm": 1.4978464841842651, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8634238839149475, + "num_tokens": 170987715.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "grad_norm": 1.4254333972930908, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8599885702133179, + "num_tokens": 171030584.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "grad_norm": 1.5252680778503418, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8711371421813965, + "num_tokens": 171067576.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "grad_norm": 1.5176595449447632, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8738877773284912, + "num_tokens": 
171105221.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "grad_norm": 1.4918359518051147, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8682781457901001, + "num_tokens": 171143823.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "grad_norm": 1.4697948694229126, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8779657483100891, + "num_tokens": 171177543.0, + "step": 4486 + }, + { + "epoch": 0.5707925200356189, + "grad_norm": 1.5005987882614136, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8541243076324463, + "num_tokens": 171220973.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "grad_norm": 1.6323696374893188, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8677760362625122, + "num_tokens": 171255230.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "grad_norm": 1.5872913599014282, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8640506267547607, + "num_tokens": 171292369.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "grad_norm": 1.617362141609192, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8756150603294373, + "num_tokens": 171323151.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "grad_norm": 1.3800541162490845, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8791260719299316, + "num_tokens": 171363273.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.5661920309066772, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8697943091392517, + "num_tokens": 171400893.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "grad_norm": 1.414503812789917, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8671399354934692, + "num_tokens": 171441766.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "grad_norm": 1.599252700805664, + "learning_rate": 1e-06, + 
"loss": 0.3947, + "mean_token_accuracy": 0.8625898361206055, + "num_tokens": 171476128.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "grad_norm": 1.6284476518630981, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8603708148002625, + "num_tokens": 171511624.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "grad_norm": 1.4019486904144287, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8776026964187622, + "num_tokens": 171550665.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "grad_norm": 1.6006789207458496, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8683449029922485, + "num_tokens": 171583853.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "grad_norm": 1.4509525299072266, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8725371360778809, + "num_tokens": 171622305.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "grad_norm": 1.4425910711288452, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8640743494033813, + "num_tokens": 171662879.0, + "step": 4499 + }, + { + "epoch": 0.5724462536572955, + "grad_norm": 1.4250831604003906, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.859281063079834, + "num_tokens": 171702735.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "grad_norm": 1.5645641088485718, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8584368228912354, + "num_tokens": 171737977.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "grad_norm": 1.4451804161071777, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8593093752861023, + "num_tokens": 171777392.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "grad_norm": 1.4207708835601807, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8636888265609741, + "num_tokens": 171819601.0, + "step": 4503 + }, + { + "epoch": 
0.5729550947716575, + "grad_norm": 1.4538283348083496, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8728584051132202, + "num_tokens": 171853510.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "grad_norm": 1.6954880952835083, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8622655868530273, + "num_tokens": 171888437.0, + "step": 4505 + }, + { + "epoch": 0.5732095153288386, + "grad_norm": 1.5490232706069946, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8590625524520874, + "num_tokens": 171927883.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "grad_norm": 1.482858657836914, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8656501770019531, + "num_tokens": 171966081.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "grad_norm": 1.6156514883041382, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8702596426010132, + "num_tokens": 171997152.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "grad_norm": 1.4752347469329834, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8596630692481995, + "num_tokens": 172034765.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "grad_norm": 1.4638112783432007, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8696924448013306, + "num_tokens": 172073399.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "grad_norm": 1.4637459516525269, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8652840852737427, + "num_tokens": 172111059.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "grad_norm": 1.5729737281799316, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8689221739768982, + "num_tokens": 172148169.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "grad_norm": 1.5085560083389282, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 
0.8633785843849182, + "num_tokens": 172187344.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "grad_norm": 1.4413161277770996, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8635255098342896, + "num_tokens": 172225270.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "grad_norm": 1.4576265811920166, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8729169964790344, + "num_tokens": 172261990.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "grad_norm": 1.608350157737732, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8632656931877136, + "num_tokens": 172295379.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "grad_norm": 1.5172568559646606, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8601207733154297, + "num_tokens": 172336473.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "grad_norm": 1.49824857711792, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8593487739562988, + "num_tokens": 172374581.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "grad_norm": 1.4823075532913208, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8537662625312805, + "num_tokens": 172413782.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "grad_norm": 1.5535435676574707, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8742233514785767, + "num_tokens": 172450173.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "grad_norm": 1.5166019201278687, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8689669370651245, + "num_tokens": 172490410.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "grad_norm": 1.4321211576461792, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8775891065597534, + "num_tokens": 172528205.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "grad_norm": 
1.566442608833313, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8643674850463867, + "num_tokens": 172565902.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "grad_norm": 1.5200783014297485, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8639097213745117, + "num_tokens": 172603404.0, + "step": 4524 + }, + { + "epoch": 0.5756265106220583, + "grad_norm": 1.4763706922531128, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8635343313217163, + "num_tokens": 172643626.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "grad_norm": 1.6364668607711792, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8630537986755371, + "num_tokens": 172676248.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "grad_norm": 1.5756899118423462, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8626149296760559, + "num_tokens": 172712669.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "grad_norm": 1.4169427156448364, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8686625957489014, + "num_tokens": 172751778.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "grad_norm": 1.5310149192810059, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8684614300727844, + "num_tokens": 172790537.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "grad_norm": 1.4306714534759521, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8644703030586243, + "num_tokens": 172832242.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "grad_norm": 1.5612715482711792, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8709746599197388, + "num_tokens": 172869348.0, + "step": 4531 + }, + { + "epoch": 0.5765169825721919, + "grad_norm": 1.3789182901382446, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8764665126800537, + "num_tokens": 
172911774.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "grad_norm": 1.4288558959960938, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8696167469024658, + "num_tokens": 172951342.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "grad_norm": 1.543434739112854, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8648358583450317, + "num_tokens": 172991160.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "grad_norm": 1.4796669483184814, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8594516515731812, + "num_tokens": 173032151.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "grad_norm": 1.4294662475585938, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8733198046684265, + "num_tokens": 173069956.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "grad_norm": 1.6001858711242676, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8760665655136108, + "num_tokens": 173099334.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "grad_norm": 1.5810582637786865, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.87067711353302, + "num_tokens": 173133231.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "grad_norm": 1.4969203472137451, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8843631744384766, + "num_tokens": 173168338.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "grad_norm": 1.4418344497680664, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8554481863975525, + "num_tokens": 173207854.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "grad_norm": 1.5545753240585327, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.847787618637085, + "num_tokens": 173250600.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "grad_norm": 1.566948413848877, + "learning_rate": 1e-06, + 
"loss": 0.4315, + "mean_token_accuracy": 0.8521588444709778, + "num_tokens": 173286981.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "grad_norm": 1.5279157161712646, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8457188010215759, + "num_tokens": 173330191.0, + "step": 4543 + }, + { + "epoch": 0.578043505915278, + "grad_norm": 1.592023253440857, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8578906059265137, + "num_tokens": 173370700.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "grad_norm": 1.4914551973342896, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.872291088104248, + "num_tokens": 173411681.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "grad_norm": 1.6118524074554443, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8567193746566772, + "num_tokens": 173447294.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "grad_norm": 1.5006554126739502, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8707554936408997, + "num_tokens": 173484267.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "grad_norm": 1.5575153827667236, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.848085880279541, + "num_tokens": 173524419.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "grad_norm": 1.735714077949524, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8695129156112671, + "num_tokens": 173557029.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "grad_norm": 1.4465389251708984, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8561794757843018, + "num_tokens": 173602481.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + "grad_norm": 1.559095859527588, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8803682327270508, + "num_tokens": 173639074.0, + "step": 4551 + }, + { + "epoch": 
0.579061188144002, + "grad_norm": 1.642388105392456, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8546488285064697, + "num_tokens": 173674339.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "grad_norm": 1.4232308864593506, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8835222721099854, + "num_tokens": 173709198.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "grad_norm": 1.6301500797271729, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.862028181552887, + "num_tokens": 173740983.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "grad_norm": 1.462418556213379, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8500012159347534, + "num_tokens": 173782873.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "grad_norm": 1.5006787776947021, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8747333288192749, + "num_tokens": 173821453.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "grad_norm": 1.4452238082885742, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8661136031150818, + "num_tokens": 173860622.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "grad_norm": 1.6232151985168457, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8636779189109802, + "num_tokens": 173895194.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "grad_norm": 1.6450141668319702, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8649927377700806, + "num_tokens": 173928390.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "grad_norm": 1.5925157070159912, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8606389760971069, + "num_tokens": 173969201.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "grad_norm": 1.6533881425857544, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 
0.8780359625816345, + "num_tokens": 173999362.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "grad_norm": 1.5681769847869873, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8608640432357788, + "num_tokens": 174037576.0, + "step": 4562 + }, + { + "epoch": 0.5804605012084977, + "grad_norm": 1.569820761680603, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8692026138305664, + "num_tokens": 174071439.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "grad_norm": 1.4918328523635864, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8656538724899292, + "num_tokens": 174108653.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "grad_norm": 1.4922573566436768, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8611981272697449, + "num_tokens": 174149710.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "grad_norm": 1.4874870777130127, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8734676837921143, + "num_tokens": 174185407.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "grad_norm": 1.3951380252838135, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.871712327003479, + "num_tokens": 174228129.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "grad_norm": 1.6098506450653076, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8590201735496521, + "num_tokens": 174265774.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "grad_norm": 1.4867020845413208, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.867878794670105, + "num_tokens": 174304319.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "grad_norm": 1.4853051900863647, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.860867977142334, + "num_tokens": 174343161.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "grad_norm": 
1.4331638813018799, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8722038269042969, + "num_tokens": 174381802.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "grad_norm": 1.519489049911499, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8614855408668518, + "num_tokens": 174418570.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "grad_norm": 1.4223897457122803, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8686889410018921, + "num_tokens": 174462962.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "grad_norm": 1.5080851316452026, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.876311182975769, + "num_tokens": 174498146.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "grad_norm": 1.6480598449707031, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.863601565361023, + "num_tokens": 174529595.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "grad_norm": 1.6292997598648071, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8632451295852661, + "num_tokens": 174566469.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "grad_norm": 1.4782047271728516, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8599199652671814, + "num_tokens": 174607651.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "grad_norm": 1.4984105825424194, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8712174892425537, + "num_tokens": 174647786.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "grad_norm": 1.5954806804656982, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8568439483642578, + "num_tokens": 174683057.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "grad_norm": 1.5624210834503174, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8547098636627197, + "num_tokens": 
174720324.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "grad_norm": 1.56748366355896, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8556170463562012, + "num_tokens": 174758785.0, + "step": 4581 + }, + { + "epoch": 0.5828774965017174, + "grad_norm": 1.5345004796981812, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8532271385192871, + "num_tokens": 174797699.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "grad_norm": 1.4110101461410522, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8709213137626648, + "num_tokens": 174839816.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "grad_norm": 1.4659079313278198, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8671857118606567, + "num_tokens": 174880144.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "grad_norm": 1.589680790901184, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8650304079055786, + "num_tokens": 174914362.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "grad_norm": 1.3526328802108765, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8684092164039612, + "num_tokens": 174959254.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "grad_norm": 1.5559213161468506, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8623563051223755, + "num_tokens": 174998010.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "grad_norm": 1.4013622999191284, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8691514134407043, + "num_tokens": 175043058.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "grad_norm": 1.4166395664215088, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8699151873588562, + "num_tokens": 175081749.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "grad_norm": 1.554442286491394, + "learning_rate": 1e-06, + 
"loss": 0.4126, + "mean_token_accuracy": 0.8594571948051453, + "num_tokens": 175119903.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "grad_norm": 1.573796033859253, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8607369661331177, + "num_tokens": 175153482.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "grad_norm": 1.457779884338379, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8707565665245056, + "num_tokens": 175193652.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "grad_norm": 1.5439279079437256, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8528559803962708, + "num_tokens": 175236119.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "grad_norm": 1.5565201044082642, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8576961159706116, + "num_tokens": 175271951.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "grad_norm": 1.4850609302520752, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8668050765991211, + "num_tokens": 175308680.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "grad_norm": 1.4260176420211792, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8635324239730835, + "num_tokens": 175348197.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "grad_norm": 1.4613550901412964, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8587636947631836, + "num_tokens": 175388886.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "grad_norm": 1.5242544412612915, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8692855834960938, + "num_tokens": 175427264.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "grad_norm": 1.3069382905960083, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.872144341468811, + "num_tokens": 175470733.0, + "step": 4599 + }, + { + "epoch": 
0.5851672815163466, + "grad_norm": 1.4441049098968506, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8699861764907837, + "num_tokens": 175508502.0, + "step": 4600 + }, + { + "epoch": 0.585294491794937, + "grad_norm": 1.5199590921401978, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8630372285842896, + "num_tokens": 175546068.0, + "step": 4601 + }, + { + "epoch": 0.5854217020735275, + "grad_norm": 1.5371419191360474, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8681614995002747, + "num_tokens": 175580998.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "grad_norm": 1.4668312072753906, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8560339212417603, + "num_tokens": 175619660.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "grad_norm": 1.6005860567092896, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.852064847946167, + "num_tokens": 175656513.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "grad_norm": 1.4495676755905151, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8654114007949829, + "num_tokens": 175695778.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "grad_norm": 1.6258254051208496, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8603798151016235, + "num_tokens": 175729235.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "grad_norm": 1.5082147121429443, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.873680055141449, + "num_tokens": 175764214.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "grad_norm": 1.4087365865707397, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8658868074417114, + "num_tokens": 175809538.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "grad_norm": 1.556862711906433, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 
0.8420822620391846, + "num_tokens": 175847729.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "grad_norm": 1.5304337739944458, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8622077703475952, + "num_tokens": 175882675.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "grad_norm": 1.4310811758041382, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8799102306365967, + "num_tokens": 175920265.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "grad_norm": 1.5937904119491577, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8586620092391968, + "num_tokens": 175957780.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "grad_norm": 1.7853477001190186, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8696901798248291, + "num_tokens": 175988268.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "grad_norm": 1.45501708984375, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8711540102958679, + "num_tokens": 176029554.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "grad_norm": 1.5284132957458496, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8641906976699829, + "num_tokens": 176065230.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "grad_norm": 1.8431015014648438, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8680256009101868, + "num_tokens": 176097938.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "grad_norm": 1.7170586585998535, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8417093753814697, + "num_tokens": 176129472.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "grad_norm": 1.618641972541809, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8612082004547119, + "num_tokens": 176165549.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "grad_norm": 
1.6502631902694702, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8603194952011108, + "num_tokens": 176204381.0, + "step": 4619 + }, + { + "epoch": 0.5877114870881567, + "grad_norm": 1.469711422920227, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8621439337730408, + "num_tokens": 176244162.0, + "step": 4620 + }, + { + "epoch": 0.5878386973667472, + "grad_norm": 1.4387998580932617, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8551113605499268, + "num_tokens": 176288209.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "grad_norm": 1.4739612340927124, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8731685876846313, + "num_tokens": 176327126.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "grad_norm": 1.4841227531433105, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8780930042266846, + "num_tokens": 176364155.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "grad_norm": 1.5393236875534058, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8783523440361023, + "num_tokens": 176398180.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "grad_norm": 1.4749948978424072, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8640866875648499, + "num_tokens": 176441474.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "grad_norm": 1.465678095817566, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8595818281173706, + "num_tokens": 176482973.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "grad_norm": 1.6468501091003418, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8665091395378113, + "num_tokens": 176517629.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "grad_norm": 1.4710290431976318, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8628765940666199, + "num_tokens": 
176554551.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "grad_norm": 1.5378044843673706, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8630409240722656, + "num_tokens": 176592192.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "grad_norm": 1.451309084892273, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8733752369880676, + "num_tokens": 176628389.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "grad_norm": 1.4390050172805786, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8618475794792175, + "num_tokens": 176672729.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "grad_norm": 1.4255534410476685, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8505961894989014, + "num_tokens": 176715562.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "grad_norm": 1.518648386001587, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8730515241622925, + "num_tokens": 176750508.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "grad_norm": 1.5994446277618408, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8591312766075134, + "num_tokens": 176785307.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "grad_norm": 1.5576688051223755, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8745693564414978, + "num_tokens": 176820490.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "grad_norm": 1.456507921218872, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8646877408027649, + "num_tokens": 176860414.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "grad_norm": 1.507477045059204, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.869139552116394, + "num_tokens": 176899551.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "grad_norm": 1.582169532775879, + "learning_rate": 1e-06, + 
"loss": 0.3785, + "mean_token_accuracy": 0.8702212572097778, + "num_tokens": 176932608.0, + "step": 4638 + }, + { + "epoch": 0.5901284823813764, + "grad_norm": 1.5275778770446777, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8464977741241455, + "num_tokens": 176975487.0, + "step": 4639 + }, + { + "epoch": 0.5902556926599669, + "grad_norm": 1.4477782249450684, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8706750869750977, + "num_tokens": 177013236.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "grad_norm": 1.4566452503204346, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8564985990524292, + "num_tokens": 177053637.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "grad_norm": 1.5116443634033203, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8642207384109497, + "num_tokens": 177093581.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "grad_norm": 1.5063209533691406, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8733372688293457, + "num_tokens": 177129782.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "grad_norm": 1.5730715990066528, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8606796264648438, + "num_tokens": 177168240.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "grad_norm": 1.4546931982040405, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8598772287368774, + "num_tokens": 177211743.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "grad_norm": 1.5160951614379883, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8709492087364197, + "num_tokens": 177250784.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "grad_norm": 1.4772809743881226, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.863280177116394, + "num_tokens": 177289354.0, + "step": 4647 + }, + { + "epoch": 
0.591273374888691, + "grad_norm": 1.688114881515503, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.866541862487793, + "num_tokens": 177324649.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "grad_norm": 1.5104695558547974, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8714219927787781, + "num_tokens": 177365281.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "grad_norm": 1.5603046417236328, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8670520186424255, + "num_tokens": 177401678.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "grad_norm": 1.5011036396026611, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8399985432624817, + "num_tokens": 177442374.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "grad_norm": 1.5644320249557495, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8702531456947327, + "num_tokens": 177477825.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "grad_norm": 1.4411927461624146, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8711543083190918, + "num_tokens": 177516405.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "grad_norm": 1.5446187257766724, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8782563209533691, + "num_tokens": 177551240.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "grad_norm": 1.442711591720581, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8631994128227234, + "num_tokens": 177592889.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "grad_norm": 1.4538286924362183, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8671432733535767, + "num_tokens": 177630610.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "grad_norm": 1.5745933055877686, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 
0.8548077344894409, + "num_tokens": 177669257.0, + "step": 4657 + }, + { + "epoch": 0.5925454776745961, + "grad_norm": 1.5099600553512573, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8757964372634888, + "num_tokens": 177705965.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "grad_norm": 1.5478978157043457, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.850752592086792, + "num_tokens": 177743753.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "grad_norm": 1.6105237007141113, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8625515699386597, + "num_tokens": 177774742.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "grad_norm": 1.4998363256454468, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8788895010948181, + "num_tokens": 177813326.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "grad_norm": 1.4989534616470337, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8599644899368286, + "num_tokens": 177851014.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "grad_norm": 1.4881325960159302, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.852665364742279, + "num_tokens": 177889745.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "grad_norm": 1.5636156797409058, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8649480938911438, + "num_tokens": 177927354.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "grad_norm": 1.5716297626495361, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8615649938583374, + "num_tokens": 177962366.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "grad_norm": 1.6493374109268188, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8601402640342712, + "num_tokens": 177997424.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "grad_norm": 
1.5397883653640747, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.858234167098999, + "num_tokens": 178034030.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "grad_norm": 1.5198283195495605, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8735617399215698, + "num_tokens": 178069237.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "grad_norm": 1.4700170755386353, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8570548295974731, + "num_tokens": 178109482.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "grad_norm": 1.538520336151123, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8644450902938843, + "num_tokens": 178151432.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "grad_norm": 1.4960947036743164, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8527815341949463, + "num_tokens": 178191102.0, + "step": 4671 + }, + { + "epoch": 0.5943264215748633, + "grad_norm": 1.576356291770935, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8793955445289612, + "num_tokens": 178223208.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "grad_norm": 1.6134201288223267, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8602725267410278, + "num_tokens": 178256134.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "grad_norm": 1.5016543865203857, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8691598176956177, + "num_tokens": 178297562.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "grad_norm": 1.539939045906067, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8734574317932129, + "num_tokens": 178338956.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "grad_norm": 1.414038896560669, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8721964359283447, + "num_tokens": 
178376881.0, + "step": 4676 + }, + { + "epoch": 0.5949624729678158, + "grad_norm": 1.739166021347046, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8663960695266724, + "num_tokens": 178411466.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "grad_norm": 1.6117554903030396, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8667383193969727, + "num_tokens": 178446669.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "grad_norm": 1.7232047319412231, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8436390161514282, + "num_tokens": 178481012.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "grad_norm": 1.378686547279358, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.869203507900238, + "num_tokens": 178524897.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "grad_norm": 1.5123686790466309, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8627113699913025, + "num_tokens": 178564898.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "grad_norm": 1.4834692478179932, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8603655695915222, + "num_tokens": 178604627.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "grad_norm": 1.5256847143173218, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8654680252075195, + "num_tokens": 178643483.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "grad_norm": 1.5880519151687622, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8602390289306641, + "num_tokens": 178680379.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "grad_norm": 1.483829140663147, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8914104700088501, + "num_tokens": 178714528.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "grad_norm": 1.515546202659607, + "learning_rate": 1e-06, + 
"loss": 0.3795, + "mean_token_accuracy": 0.8671324253082275, + "num_tokens": 178755324.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "grad_norm": 1.441610336303711, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8732683658599854, + "num_tokens": 178791897.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "grad_norm": 1.6660499572753906, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8659020662307739, + "num_tokens": 178824595.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "grad_norm": 1.5130211114883423, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8790581226348877, + "num_tokens": 178862264.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "grad_norm": 1.5100979804992676, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8537837862968445, + "num_tokens": 178904131.0, + "step": 4690 + }, + { + "epoch": 0.596743416868083, + "grad_norm": 1.4825294017791748, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8704802989959717, + "num_tokens": 178942570.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "grad_norm": 1.5107378959655762, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8630849123001099, + "num_tokens": 178983954.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "grad_norm": 1.4839816093444824, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8569903373718262, + "num_tokens": 179025634.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "grad_norm": 1.5916154384613037, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8587368726730347, + "num_tokens": 179062797.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "grad_norm": 1.5380380153656006, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8636364936828613, + "num_tokens": 179097801.0, + "step": 4695 + }, + { + "epoch": 
0.5973794682610355, + "grad_norm": 1.6461374759674072, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8462827205657959, + "num_tokens": 179133078.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "grad_norm": 1.520262598991394, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8684477210044861, + "num_tokens": 179172716.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "grad_norm": 1.6809771060943604, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8651677370071411, + "num_tokens": 179206390.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "grad_norm": 1.5928953886032104, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8687816858291626, + "num_tokens": 179239665.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "grad_norm": 1.3246372938156128, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8801531791687012, + "num_tokens": 179279944.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "grad_norm": 1.4225270748138428, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8608784079551697, + "num_tokens": 179319527.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "grad_norm": 1.4615670442581177, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8614793419837952, + "num_tokens": 179358674.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "grad_norm": 1.355242133140564, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8728964328765869, + "num_tokens": 179403331.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "grad_norm": 1.5250657796859741, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8654459714889526, + "num_tokens": 179440091.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "grad_norm": 1.5734081268310547, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 
0.864345371723175, + "num_tokens": 179474277.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "grad_norm": 1.360527515411377, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8764973878860474, + "num_tokens": 179516756.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "grad_norm": 1.484716534614563, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.870171308517456, + "num_tokens": 179550813.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "grad_norm": 1.4261146783828735, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.863629937171936, + "num_tokens": 179593040.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "grad_norm": 1.6323051452636719, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8468849658966064, + "num_tokens": 179629034.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, + "grad_norm": 1.4310799837112427, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8541238903999329, + "num_tokens": 179673674.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "grad_norm": 1.45699942111969, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.865878701210022, + "num_tokens": 179712829.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "grad_norm": 1.4868865013122559, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.872596025466919, + "num_tokens": 179749830.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "grad_norm": 1.4770734310150146, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.855979859828949, + "num_tokens": 179790594.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "grad_norm": 1.49191153049469, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8660227060317993, + "num_tokens": 179831138.0, + "step": 4714 + }, + { + "epoch": 0.5997964635542552, + "grad_norm": 1.5145493745803833, + 
"learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8640656471252441, + "num_tokens": 179869954.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "grad_norm": 1.5454596281051636, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.849950909614563, + "num_tokens": 179905577.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "grad_norm": 1.490769386291504, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8747715950012207, + "num_tokens": 179940966.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "grad_norm": 1.5288572311401367, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8619129657745361, + "num_tokens": 179976814.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "grad_norm": 1.5760756731033325, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8543518781661987, + "num_tokens": 180013398.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "grad_norm": 1.5167465209960938, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8669821619987488, + "num_tokens": 180051370.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "grad_norm": 1.597312331199646, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8588851690292358, + "num_tokens": 180087071.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "grad_norm": 1.3710073232650757, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8662866353988647, + "num_tokens": 180131576.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "grad_norm": 1.5056085586547852, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8702760934829712, + "num_tokens": 180170753.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "grad_norm": 1.4766982793807983, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.867324709892273, + "num_tokens": 180207110.0, + "step": 4724 + 
}, + { + "epoch": 0.6010685663401603, + "grad_norm": 1.5466641187667847, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8615553379058838, + "num_tokens": 180238914.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "grad_norm": 1.6886146068572998, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8628033399581909, + "num_tokens": 180271799.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "grad_norm": 1.5208066701889038, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8803591728210449, + "num_tokens": 180306693.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "grad_norm": 1.4837441444396973, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8750842809677124, + "num_tokens": 180341698.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "grad_norm": 1.4614291191101074, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8823124766349792, + "num_tokens": 180377994.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "grad_norm": 1.6374889612197876, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8513565063476562, + "num_tokens": 180414384.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "grad_norm": 1.6572370529174805, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8632298707962036, + "num_tokens": 180448174.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "grad_norm": 1.4539105892181396, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8749532103538513, + "num_tokens": 180489780.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "grad_norm": 1.4769552946090698, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8714518547058105, + "num_tokens": 180531289.0, + "step": 4733 + }, + { + "epoch": 0.6022134588474749, + "grad_norm": 1.5642385482788086, + "learning_rate": 1e-06, + "loss": 0.4184, + 
"mean_token_accuracy": 0.8559102416038513, + "num_tokens": 180567820.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "grad_norm": 1.5190303325653076, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8515602350234985, + "num_tokens": 180606745.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "grad_norm": 1.504896879196167, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8656697869300842, + "num_tokens": 180642040.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "grad_norm": 1.4616961479187012, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8734755516052246, + "num_tokens": 180679113.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "grad_norm": 1.5189062356948853, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.880635142326355, + "num_tokens": 180713742.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "grad_norm": 1.5703579187393188, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8651669025421143, + "num_tokens": 180749473.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "grad_norm": 1.6315240859985352, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8697630167007446, + "num_tokens": 180783242.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "grad_norm": 1.4618765115737915, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8624675273895264, + "num_tokens": 180823139.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "grad_norm": 1.562079906463623, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8513281941413879, + "num_tokens": 180858724.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "grad_norm": 1.5549019575119019, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8718328475952148, + "num_tokens": 180894034.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + 
"grad_norm": 1.3982731103897095, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8732821941375732, + "num_tokens": 180933049.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "grad_norm": 1.5221792459487915, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8782457709312439, + "num_tokens": 180966416.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "grad_norm": 1.6360080242156982, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8589247465133667, + "num_tokens": 181003972.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "grad_norm": 1.4602251052856445, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8657895922660828, + "num_tokens": 181043269.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "grad_norm": 1.4188485145568848, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.888786792755127, + "num_tokens": 181084009.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "grad_norm": 1.4868344068527222, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8676345348358154, + "num_tokens": 181124927.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "grad_norm": 1.5582116842269897, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8607624173164368, + "num_tokens": 181158987.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "grad_norm": 1.5276597738265991, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8464482426643372, + "num_tokens": 181198281.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "grad_norm": 1.6303361654281616, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8653296232223511, + "num_tokens": 181233169.0, + "step": 4752 + }, + { + "epoch": 0.6046304541406946, + "grad_norm": 1.6110913753509521, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8500539064407349, + 
"num_tokens": 181268432.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "grad_norm": 1.4528064727783203, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8598398566246033, + "num_tokens": 181307430.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "grad_norm": 1.7172493934631348, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.841633141040802, + "num_tokens": 181345092.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "grad_norm": 1.4538681507110596, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8633080720901489, + "num_tokens": 181381621.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "grad_norm": 1.4050918817520142, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8667176961898804, + "num_tokens": 181423613.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "grad_norm": 1.4402226209640503, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8744947910308838, + "num_tokens": 181462559.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "grad_norm": 1.5062683820724487, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.856050968170166, + "num_tokens": 181498713.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "grad_norm": 1.5679055452346802, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8671038746833801, + "num_tokens": 181531001.0, + "step": 4760 + }, + { + "epoch": 0.6056481363694186, + "grad_norm": 1.5974079370498657, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8693663477897644, + "num_tokens": 181569657.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "grad_norm": 1.4100642204284668, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8802404403686523, + "num_tokens": 181605731.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "grad_norm": 1.4478669166564941, + 
"learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8578913807868958, + "num_tokens": 181648829.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "grad_norm": 1.4201823472976685, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8503122329711914, + "num_tokens": 181693762.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "grad_norm": 1.4780532121658325, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8756375312805176, + "num_tokens": 181730462.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "grad_norm": 1.5073683261871338, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8357110023498535, + "num_tokens": 181772837.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "grad_norm": 1.425397276878357, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8560808897018433, + "num_tokens": 181818505.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "grad_norm": 1.6147195100784302, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8522353172302246, + "num_tokens": 181856627.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "grad_norm": 1.4590179920196533, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.852494478225708, + "num_tokens": 181897263.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "grad_norm": 1.5048980712890625, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8702324628829956, + "num_tokens": 181932200.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "grad_norm": 1.62050461769104, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.866897463798523, + "num_tokens": 181963683.0, + "step": 4771 + }, + { + "epoch": 0.6070474494339143, + "grad_norm": 1.684609055519104, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.848558783531189, + "num_tokens": 181999822.0, + "step": 4772 + }, 
+ { + "epoch": 0.6071746597125047, + "grad_norm": 1.5969114303588867, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8755506873130798, + "num_tokens": 182033103.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "grad_norm": 1.4265341758728027, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8795663118362427, + "num_tokens": 182072245.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "grad_norm": 1.3814153671264648, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.861094057559967, + "num_tokens": 182113578.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "grad_norm": 1.5296698808670044, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8881959915161133, + "num_tokens": 182147604.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "grad_norm": 1.4468129873275757, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8704186677932739, + "num_tokens": 182184900.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "grad_norm": 1.5213522911071777, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8633408546447754, + "num_tokens": 182220932.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "grad_norm": 1.5058495998382568, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8606970310211182, + "num_tokens": 182259815.0, + "step": 4779 + }, + { + "epoch": 0.6080651316626383, + "grad_norm": 1.4961947202682495, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8785904049873352, + "num_tokens": 182292245.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "grad_norm": 1.4523662328720093, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8738365769386292, + "num_tokens": 182330733.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "grad_norm": 1.5420290231704712, + "learning_rate": 1e-06, + "loss": 0.386, + 
"mean_token_accuracy": 0.8659937977790833, + "num_tokens": 182364261.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "grad_norm": 1.5456064939498901, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8610284328460693, + "num_tokens": 182400723.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "grad_norm": 1.4930400848388672, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8522140979766846, + "num_tokens": 182441087.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "grad_norm": 1.5756906270980835, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8528228998184204, + "num_tokens": 182477123.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "grad_norm": 1.644639253616333, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8553478717803955, + "num_tokens": 182510213.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "grad_norm": 1.4820958375930786, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8673780560493469, + "num_tokens": 182548773.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "grad_norm": 1.5225965976715088, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8570541143417358, + "num_tokens": 182586507.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "grad_norm": 1.534011960029602, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8668044209480286, + "num_tokens": 182622571.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "grad_norm": 1.5833629369735718, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.848883867263794, + "num_tokens": 182662240.0, + "step": 4790 + }, + { + "epoch": 0.6094644447271339, + "grad_norm": 1.5917576551437378, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8576328754425049, + "num_tokens": 182700175.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + 
"grad_norm": 1.5584685802459717, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8568175435066223, + "num_tokens": 182738356.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "grad_norm": 1.3870190382003784, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8638105392456055, + "num_tokens": 182777810.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "grad_norm": 1.536948561668396, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8753067255020142, + "num_tokens": 182814944.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "grad_norm": 1.445895791053772, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8649779558181763, + "num_tokens": 182855325.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "grad_norm": 1.463032603263855, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8750245571136475, + "num_tokens": 182893196.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "grad_norm": 1.460968017578125, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8738033175468445, + "num_tokens": 182929993.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "grad_norm": 1.458983302116394, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8553046584129333, + "num_tokens": 182973655.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, + "grad_norm": 1.5498958826065063, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8691715002059937, + "num_tokens": 183008062.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "grad_norm": 1.6847186088562012, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8599206209182739, + "num_tokens": 183044596.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "grad_norm": 1.614869475364685, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8478243350982666, + "num_tokens": 
183083229.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "grad_norm": 1.5150175094604492, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8626266121864319, + "num_tokens": 183121788.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "grad_norm": 1.6910667419433594, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8577901124954224, + "num_tokens": 183152634.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "grad_norm": 1.4361627101898193, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8748890161514282, + "num_tokens": 183189273.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "grad_norm": 1.5597490072250366, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8678439855575562, + "num_tokens": 183228092.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "grad_norm": 1.5994337797164917, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8707005977630615, + "num_tokens": 183261625.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "grad_norm": 1.3766459226608276, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8800094127655029, + "num_tokens": 183301989.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "grad_norm": 1.3910112380981445, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8771747946739197, + "num_tokens": 183344518.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "grad_norm": 1.6468998193740845, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8677151203155518, + "num_tokens": 183376082.0, + "step": 4809 + }, + { + "epoch": 0.6118814400203536, + "grad_norm": 1.548169732093811, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8520643711090088, + "num_tokens": 183414608.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "grad_norm": 1.5745068788528442, + "learning_rate": 1e-06, + 
"loss": 0.378, + "mean_token_accuracy": 0.8713585138320923, + "num_tokens": 183451091.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "grad_norm": 1.5067811012268066, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8649479150772095, + "num_tokens": 183490286.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "grad_norm": 1.5104269981384277, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8527824878692627, + "num_tokens": 183529298.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "grad_norm": 1.3846054077148438, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8672090172767639, + "num_tokens": 183573298.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "grad_norm": 1.366856336593628, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8597887754440308, + "num_tokens": 183617974.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "grad_norm": 1.434933066368103, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8674451112747192, + "num_tokens": 183660985.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "grad_norm": 1.5266369581222534, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8675342202186584, + "num_tokens": 183695969.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "grad_norm": 1.544378399848938, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.860676646232605, + "num_tokens": 183736004.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "grad_norm": 1.4031953811645508, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8770692348480225, + "num_tokens": 183770967.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "grad_norm": 1.5050543546676636, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8703321814537048, + "num_tokens": 183806776.0, + "step": 4820 + }, + { + "epoch": 
0.6132807530848493, + "grad_norm": 1.3819724321365356, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.861316442489624, + "num_tokens": 183849559.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "grad_norm": 1.2776802778244019, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8689590692520142, + "num_tokens": 183897547.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "grad_norm": 1.611067771911621, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8640072345733643, + "num_tokens": 183933889.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "grad_norm": 1.3298379182815552, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.876868486404419, + "num_tokens": 183977168.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "grad_norm": 1.4008742570877075, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8853527307510376, + "num_tokens": 184014371.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "grad_norm": 1.5913043022155762, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8579264879226685, + "num_tokens": 184049626.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "grad_norm": 1.6149753332138062, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8670224547386169, + "num_tokens": 184082308.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "grad_norm": 1.353858232498169, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8746297955513, + "num_tokens": 184124580.0, + "step": 4828 + }, + { + "epoch": 0.6142984353135733, + "grad_norm": 1.452358603477478, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8590282201766968, + "num_tokens": 184165069.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "grad_norm": 1.44640052318573, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 
0.8772375583648682, + "num_tokens": 184200763.0, + "step": 4830 + }, + { + "epoch": 0.6145528558707544, + "grad_norm": 1.5852819681167603, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8548089861869812, + "num_tokens": 184238628.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "grad_norm": 1.5652475357055664, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8816885948181152, + "num_tokens": 184268238.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "grad_norm": 1.427348017692566, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8649638891220093, + "num_tokens": 184308805.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "grad_norm": 1.4677473306655884, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.868125319480896, + "num_tokens": 184350501.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "grad_norm": 1.502785563468933, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8606374263763428, + "num_tokens": 184395462.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "grad_norm": 1.3987263441085815, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8730300664901733, + "num_tokens": 184434895.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "grad_norm": 1.4665727615356445, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8628674745559692, + "num_tokens": 184474190.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "grad_norm": 1.5348539352416992, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8653843402862549, + "num_tokens": 184508779.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "grad_norm": 1.3537653684616089, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8685463070869446, + "num_tokens": 184553023.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "grad_norm": 
1.6034948825836182, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8652130365371704, + "num_tokens": 184586732.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "grad_norm": 1.4603713750839233, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8716332912445068, + "num_tokens": 184632020.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "grad_norm": 1.5609241724014282, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8523446321487427, + "num_tokens": 184670062.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "grad_norm": 1.5165729522705078, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8772646188735962, + "num_tokens": 184706850.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "grad_norm": 1.529370903968811, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8753920793533325, + "num_tokens": 184741739.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "grad_norm": 1.446706771850586, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8720162510871887, + "num_tokens": 184780417.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "grad_norm": 1.4678479433059692, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8759571313858032, + "num_tokens": 184817131.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "grad_norm": 1.5448973178863525, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8800429105758667, + "num_tokens": 184855544.0, + "step": 4847 + }, + { + "epoch": 0.616715430606793, + "grad_norm": 1.5906325578689575, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.865608274936676, + "num_tokens": 184893854.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "grad_norm": 1.558735966682434, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8639519214630127, + "num_tokens": 
184927570.0, + "step": 4849 + }, + { + "epoch": 0.6169698511639741, + "grad_norm": 1.4535478353500366, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8760392665863037, + "num_tokens": 184965097.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "grad_norm": 1.674721360206604, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.875362753868103, + "num_tokens": 184998075.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "grad_norm": 1.6064670085906982, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8447703123092651, + "num_tokens": 185035820.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "grad_norm": 1.6245276927947998, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8663784265518188, + "num_tokens": 185076814.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "grad_norm": 1.37398099899292, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8756527900695801, + "num_tokens": 185120610.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "grad_norm": 1.42660391330719, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8661738634109497, + "num_tokens": 185159682.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "grad_norm": 1.4917720556259155, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8639771342277527, + "num_tokens": 185198450.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "grad_norm": 1.8201228380203247, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8734206557273865, + "num_tokens": 185236844.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "grad_norm": 1.5762383937835693, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8627585768699646, + "num_tokens": 185271786.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "grad_norm": 1.6237704753875732, + "learning_rate": 1e-06, + 
"loss": 0.3685, + "mean_token_accuracy": 0.8749160170555115, + "num_tokens": 185310497.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "grad_norm": 1.5258417129516602, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8713082075119019, + "num_tokens": 185345147.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "grad_norm": 1.6072841882705688, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8643090128898621, + "num_tokens": 185382730.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "grad_norm": 1.4788305759429932, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8551164865493774, + "num_tokens": 185422007.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "grad_norm": 1.549630880355835, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8757126331329346, + "num_tokens": 185454472.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "grad_norm": 1.3081942796707153, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.880475640296936, + "num_tokens": 185497761.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "grad_norm": 1.6557807922363281, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8700143098831177, + "num_tokens": 185530623.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "grad_norm": 1.6613515615463257, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.866551399230957, + "num_tokens": 185563246.0, + "step": 4866 + }, + { + "epoch": 0.6191324259000127, + "grad_norm": 1.38370943069458, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8768282532691956, + "num_tokens": 185604885.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "grad_norm": 1.5582038164138794, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.862870454788208, + "num_tokens": 185639225.0, + "step": 4868 + }, + { + "epoch": 
0.6193868464571938, + "grad_norm": 1.5215048789978027, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8627105951309204, + "num_tokens": 185676231.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "grad_norm": 1.4714068174362183, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8673584461212158, + "num_tokens": 185714772.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "grad_norm": 1.6361814737319946, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8806860446929932, + "num_tokens": 185748451.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "grad_norm": 1.477156400680542, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.86905837059021, + "num_tokens": 185785014.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "grad_norm": 1.464328408241272, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8744528293609619, + "num_tokens": 185822426.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "grad_norm": 1.4129538536071777, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8796049356460571, + "num_tokens": 185860534.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "grad_norm": 1.4703320264816284, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8676884174346924, + "num_tokens": 185899635.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "grad_norm": 1.4915119409561157, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8670456409454346, + "num_tokens": 185939480.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "grad_norm": 1.5319706201553345, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8608927130699158, + "num_tokens": 185976002.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "grad_norm": 1.4026858806610107, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 
0.8616098165512085, + "num_tokens": 186017296.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "grad_norm": 1.3445217609405518, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8632103204727173, + "num_tokens": 186064645.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "grad_norm": 1.5552804470062256, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8672541975975037, + "num_tokens": 186097899.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "grad_norm": 1.4973310232162476, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8720309138298035, + "num_tokens": 186132510.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "grad_norm": 1.587459921836853, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8539773225784302, + "num_tokens": 186169042.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "grad_norm": 1.4976249933242798, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8607600927352905, + "num_tokens": 186206054.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "grad_norm": 1.6877586841583252, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8538662195205688, + "num_tokens": 186236909.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "grad_norm": 1.37173593044281, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8762971758842468, + "num_tokens": 186280859.0, + "step": 4885 + }, + { + "epoch": 0.6215494211932324, + "grad_norm": 1.519943356513977, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8582617044448853, + "num_tokens": 186321639.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "grad_norm": 1.5471198558807373, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8702402710914612, + "num_tokens": 186359160.0, + "step": 4887 + }, + { + "epoch": 0.6218038417504135, + "grad_norm": 
1.670182466506958, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8670969009399414, + "num_tokens": 186394175.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "grad_norm": 1.4535472393035889, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8540595769882202, + "num_tokens": 186437640.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "grad_norm": 1.4773519039154053, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.859782338142395, + "num_tokens": 186473730.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "grad_norm": 1.4914425611495972, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8693878650665283, + "num_tokens": 186508708.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "grad_norm": 1.4564329385757446, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8651483058929443, + "num_tokens": 186550623.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "grad_norm": 1.5318578481674194, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.871741533279419, + "num_tokens": 186586915.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "grad_norm": 1.524227499961853, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8481675386428833, + "num_tokens": 186624447.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "grad_norm": 1.5315958261489868, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8822950124740601, + "num_tokens": 186660380.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "grad_norm": 1.6393564939498901, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8474836945533752, + "num_tokens": 186699353.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "grad_norm": 1.4653970003128052, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8711998462677002, + "num_tokens": 186736742.0, 
+ "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "grad_norm": 1.6377376317977905, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8615885376930237, + "num_tokens": 186769620.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "grad_norm": 1.61221182346344, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8625309467315674, + "num_tokens": 186803175.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "grad_norm": 1.5722209215164185, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8496092557907104, + "num_tokens": 186842087.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "grad_norm": 1.359777808189392, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8764324188232422, + "num_tokens": 186883669.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "grad_norm": 1.5679320096969604, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8694029450416565, + "num_tokens": 186920182.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "grad_norm": 1.4598602056503296, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8535592555999756, + "num_tokens": 186960240.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "grad_norm": 1.4316902160644531, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.883408784866333, + "num_tokens": 187000037.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "grad_norm": 1.6494816541671753, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8488708138465881, + "num_tokens": 187031813.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "grad_norm": 1.48666512966156, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8808906078338623, + "num_tokens": 187067503.0, + "step": 4906 + }, + { + "epoch": 0.6242208370436332, + "grad_norm": 1.4251964092254639, + "learning_rate": 1e-06, + "loss": 0.3818, + 
"mean_token_accuracy": 0.8651301264762878, + "num_tokens": 187109879.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "grad_norm": 1.6220073699951172, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8775001764297485, + "num_tokens": 187141963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "grad_norm": 1.4833810329437256, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8681842088699341, + "num_tokens": 187183813.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "grad_norm": 1.5019030570983887, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8702014684677124, + "num_tokens": 187220831.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "grad_norm": 1.479332685470581, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8538864850997925, + "num_tokens": 187259738.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "grad_norm": 1.4443120956420898, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.852986752986908, + "num_tokens": 187303578.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "grad_norm": 1.4335025548934937, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8688114881515503, + "num_tokens": 187344021.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "grad_norm": 1.7538952827453613, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8705157041549683, + "num_tokens": 187375101.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "grad_norm": 1.5904419422149658, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8559657335281372, + "num_tokens": 187412325.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "grad_norm": 1.4903241395950317, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8661450147628784, + "num_tokens": 187451886.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + 
"grad_norm": 1.5200575590133667, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.867103099822998, + "num_tokens": 187488840.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "grad_norm": 1.5338557958602905, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8777929544448853, + "num_tokens": 187530253.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "grad_norm": 1.6615022420883179, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.869385838508606, + "num_tokens": 187562143.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "grad_norm": 1.5852760076522827, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.866316020488739, + "num_tokens": 187599768.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "grad_norm": 1.4876587390899658, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8723241686820984, + "num_tokens": 187637485.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "grad_norm": 1.4202466011047363, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8732913732528687, + "num_tokens": 187675048.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "grad_norm": 1.3512059450149536, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.869140625, + "num_tokens": 187720096.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "grad_norm": 1.5082746744155884, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8625733256340027, + "num_tokens": 187761354.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "grad_norm": 1.5627198219299316, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.86431884765625, + "num_tokens": 187796318.0, + "step": 4925 + }, + { + "epoch": 0.6266378323368528, + "grad_norm": 1.525234341621399, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8691831827163696, + "num_tokens": 
187831985.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "grad_norm": 1.6071076393127441, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8546866178512573, + "num_tokens": 187866586.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "grad_norm": 1.4326457977294922, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.85731440782547, + "num_tokens": 187912023.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "grad_norm": 1.5570762157440186, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8556481599807739, + "num_tokens": 187947766.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "grad_norm": 1.7085230350494385, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8550022840499878, + "num_tokens": 187982376.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "grad_norm": 1.5534816980361938, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.863129198551178, + "num_tokens": 188021139.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "grad_norm": 1.5042134523391724, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8740600347518921, + "num_tokens": 188056693.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "grad_norm": 1.4799741506576538, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8714399933815002, + "num_tokens": 188096116.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "grad_norm": 1.5499881505966187, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8731861114501953, + "num_tokens": 188133785.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "grad_norm": 1.5269967317581177, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8524998426437378, + "num_tokens": 188171171.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "grad_norm": 1.5214085578918457, + "learning_rate": 1e-06, + 
"loss": 0.3962, + "mean_token_accuracy": 0.8630372285842896, + "num_tokens": 188207716.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "grad_norm": 1.435360312461853, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8519343733787537, + "num_tokens": 188248709.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "grad_norm": 1.5813349485397339, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8688831329345703, + "num_tokens": 188283656.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "grad_norm": 1.4352093935012817, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8677982091903687, + "num_tokens": 188324566.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "grad_norm": 1.8152445554733276, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8697537779808044, + "num_tokens": 188366781.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "grad_norm": 1.526659369468689, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8740839958190918, + "num_tokens": 188407212.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "grad_norm": 1.5011416673660278, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8678015470504761, + "num_tokens": 188447645.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "grad_norm": 1.4896695613861084, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8715386390686035, + "num_tokens": 188485438.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "grad_norm": 1.4208555221557617, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8612769842147827, + "num_tokens": 188528885.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, + "grad_norm": 1.630637764930725, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8608076572418213, + "num_tokens": 188565386.0, + "step": 4945 + }, + { + "epoch": 
0.629182037908663, + "grad_norm": 1.4296287298202515, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8807260394096375, + "num_tokens": 188606371.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "grad_norm": 1.4237078428268433, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.873731255531311, + "num_tokens": 188644395.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "grad_norm": 1.599258542060852, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8774892091751099, + "num_tokens": 188674256.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "grad_norm": 1.6046714782714844, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.867534875869751, + "num_tokens": 188711205.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "grad_norm": 1.5720473527908325, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8745408058166504, + "num_tokens": 188748270.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "grad_norm": 1.6370536088943481, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8632633686065674, + "num_tokens": 188781888.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "grad_norm": 1.5000838041305542, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8706557750701904, + "num_tokens": 188819708.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "grad_norm": 1.4583029747009277, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8561232089996338, + "num_tokens": 188860622.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "grad_norm": 1.4452695846557617, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8665928840637207, + "num_tokens": 188899321.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "grad_norm": 1.5156562328338623, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 
0.8646547198295593, + "num_tokens": 188937545.0, + "step": 4955 + }, + { + "epoch": 0.6304541406945681, + "grad_norm": 1.501009464263916, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8732378482818604, + "num_tokens": 188973658.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "grad_norm": 1.459328293800354, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8514124155044556, + "num_tokens": 189017478.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "grad_norm": 1.4754712581634521, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8569216728210449, + "num_tokens": 189060604.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "grad_norm": 1.6497306823730469, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8631384372711182, + "num_tokens": 189095615.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "grad_norm": 1.3808858394622803, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8694243431091309, + "num_tokens": 189137894.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "grad_norm": 1.2802151441574097, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8696715831756592, + "num_tokens": 189184685.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "grad_norm": 1.438027024269104, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.875522255897522, + "num_tokens": 189225350.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "grad_norm": 1.5010592937469482, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.863754153251648, + "num_tokens": 189262391.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "grad_norm": 1.5554983615875244, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8547855615615845, + "num_tokens": 189299936.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "grad_norm": 
1.6417936086654663, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8392516374588013, + "num_tokens": 189336887.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "grad_norm": 1.5327091217041016, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8668147921562195, + "num_tokens": 189372950.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "grad_norm": 1.506770372390747, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8591454029083252, + "num_tokens": 189412163.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "grad_norm": 1.6731007099151611, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.844447135925293, + "num_tokens": 189448514.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "grad_norm": 1.4484344720840454, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8723757863044739, + "num_tokens": 189492224.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "grad_norm": 1.4293254613876343, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.853641927242279, + "num_tokens": 189532981.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "grad_norm": 1.499147891998291, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8686186671257019, + "num_tokens": 189568486.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "grad_norm": 1.6159892082214355, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8599623441696167, + "num_tokens": 189603860.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "grad_norm": 1.430899739265442, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8369840979576111, + "num_tokens": 189646468.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "grad_norm": 1.5776363611221313, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.86283278465271, + "num_tokens": 
189684092.0, + "step": 4974 + }, + { + "epoch": 0.6328711359877878, + "grad_norm": 1.5622892379760742, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8578065633773804, + "num_tokens": 189718276.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "grad_norm": 1.5515923500061035, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8735073804855347, + "num_tokens": 189757774.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "grad_norm": 1.5139657258987427, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8684239983558655, + "num_tokens": 189796407.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "grad_norm": 1.5621328353881836, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8742979168891907, + "num_tokens": 189834500.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "grad_norm": 1.4946070909500122, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8711294531822205, + "num_tokens": 189872487.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "grad_norm": 1.5704957246780396, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8665101528167725, + "num_tokens": 189907150.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "grad_norm": 1.4186620712280273, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8858344554901123, + "num_tokens": 189942348.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "grad_norm": 1.4834853410720825, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.869597315788269, + "num_tokens": 189980049.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "grad_norm": 1.516408920288086, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8748173713684082, + "num_tokens": 190016773.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "grad_norm": 1.41927170753479, + "learning_rate": 1e-06, + 
"loss": 0.3653, + "mean_token_accuracy": 0.8732724189758301, + "num_tokens": 190053900.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "grad_norm": 1.3097383975982666, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8776005506515503, + "num_tokens": 190098446.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "grad_norm": 1.6257721185684204, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8674377202987671, + "num_tokens": 190130413.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "grad_norm": 1.580657958984375, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8585003018379211, + "num_tokens": 190165065.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "grad_norm": 1.455062747001648, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8746775984764099, + "num_tokens": 190203382.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "grad_norm": 1.4088587760925293, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8672299981117249, + "num_tokens": 190243365.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "grad_norm": 1.4784151315689087, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8582028746604919, + "num_tokens": 190282155.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "grad_norm": 1.441954255104065, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.876367449760437, + "num_tokens": 190318468.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "grad_norm": 1.3751155138015747, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8811586499214172, + "num_tokens": 190357892.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "grad_norm": 1.5068223476409912, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8634223937988281, + "num_tokens": 190394875.0, + "step": 4993 + }, + { + "epoch": 
0.6352881312810075, + "grad_norm": 1.5659629106521606, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8664692640304565, + "num_tokens": 190428606.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "grad_norm": 1.7064340114593506, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8557102084159851, + "num_tokens": 190466314.0, + "step": 4995 + }, + { + "epoch": 0.6355425518381885, + "grad_norm": 1.6483489274978638, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8778318166732788, + "num_tokens": 190498201.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "grad_norm": 1.6337344646453857, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8467992544174194, + "num_tokens": 190535913.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "grad_norm": 1.569240927696228, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8590801954269409, + "num_tokens": 190571772.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "grad_norm": 1.3813743591308594, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8783831000328064, + "num_tokens": 190612161.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "grad_norm": 1.4751570224761963, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8540794849395752, + "num_tokens": 190652216.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "grad_norm": 1.481884241104126, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8707439303398132, + "num_tokens": 190689466.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "grad_norm": 1.548934817314148, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8675036430358887, + "num_tokens": 190727216.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "grad_norm": 1.479830026626587, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 
0.8609390258789062, + "num_tokens": 190767534.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "grad_norm": 1.5140900611877441, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8517604470252991, + "num_tokens": 190807046.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "grad_norm": 1.4316595792770386, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8653724789619446, + "num_tokens": 190846849.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "grad_norm": 1.4236754179000854, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.870174765586853, + "num_tokens": 190889219.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "grad_norm": 1.4112131595611572, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8667160868644714, + "num_tokens": 190931497.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "grad_norm": 1.6560689210891724, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8607743978500366, + "num_tokens": 190963178.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "grad_norm": 1.4407848119735718, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8601556420326233, + "num_tokens": 191005639.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "grad_norm": 1.6141705513000488, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8593454360961914, + "num_tokens": 191039935.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "grad_norm": 1.501692533493042, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8588079214096069, + "num_tokens": 191077855.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "grad_norm": 1.6495882272720337, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8612087965011597, + "num_tokens": 191111631.0, + "step": 5012 + }, + { + "epoch": 0.6377051265742272, + "grad_norm": 
1.6249576807022095, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.880865216255188, + "num_tokens": 191142958.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "grad_norm": 1.5834722518920898, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8663709163665771, + "num_tokens": 191180305.0, + "step": 5014 + }, + { + "epoch": 0.6379595471314082, + "grad_norm": 1.4318345785140991, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8653194904327393, + "num_tokens": 191220260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "grad_norm": 1.5569382905960083, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8592398166656494, + "num_tokens": 191259226.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "grad_norm": 1.4288618564605713, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8653157949447632, + "num_tokens": 191299338.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "grad_norm": 1.5035432577133179, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8590242862701416, + "num_tokens": 191334696.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "grad_norm": 1.4208557605743408, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8678016662597656, + "num_tokens": 191376215.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "grad_norm": 1.4914261102676392, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8769258260726929, + "num_tokens": 191411792.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "grad_norm": 1.4194519519805908, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.872326135635376, + "num_tokens": 191449917.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "grad_norm": 1.518186330795288, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8551068305969238, + "num_tokens": 
191488437.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "grad_norm": 1.5314396619796753, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8676598072052002, + "num_tokens": 191524260.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "grad_norm": 1.4758858680725098, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8627387881278992, + "num_tokens": 191564356.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "grad_norm": 1.4312673807144165, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8629310727119446, + "num_tokens": 191603607.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "grad_norm": 1.4099892377853394, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8563399314880371, + "num_tokens": 191649584.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "grad_norm": 1.4401535987854004, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8638483285903931, + "num_tokens": 191689702.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "grad_norm": 1.4218454360961914, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.867271900177002, + "num_tokens": 191730536.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "grad_norm": 1.3999149799346924, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.88689786195755, + "num_tokens": 191767846.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "grad_norm": 1.4672999382019043, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8533955812454224, + "num_tokens": 191810929.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "grad_norm": 1.6609828472137451, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8459078669548035, + "num_tokens": 191845052.0, + "step": 5031 + }, + { + "epoch": 0.6401221218674469, + "grad_norm": 1.4699229001998901, + "learning_rate": 1e-06, + 
"loss": 0.3939, + "mean_token_accuracy": 0.8650702238082886, + "num_tokens": 191884522.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "grad_norm": 1.6360646486282349, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8732877969741821, + "num_tokens": 191917881.0, + "step": 5033 + }, + { + "epoch": 0.6403765424246279, + "grad_norm": 1.6121653318405151, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8672248125076294, + "num_tokens": 191953808.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "grad_norm": 1.5866926908493042, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8524624109268188, + "num_tokens": 191995357.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "grad_norm": 1.53545081615448, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8546079397201538, + "num_tokens": 192034665.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "grad_norm": 1.6242064237594604, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8614296317100525, + "num_tokens": 192068313.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "grad_norm": 1.4259823560714722, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8668529391288757, + "num_tokens": 192109089.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "grad_norm": 1.4705348014831543, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8608225584030151, + "num_tokens": 192149011.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "grad_norm": 1.5629397630691528, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.866461455821991, + "num_tokens": 192184717.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "grad_norm": 1.5456318855285645, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8648902177810669, + "num_tokens": 192219760.0, + "step": 5041 + }, + { + "epoch": 
0.6413942246533519, + "grad_norm": 1.4074345827102661, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8773692846298218, + "num_tokens": 192260966.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "grad_norm": 1.4484935998916626, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8569896221160889, + "num_tokens": 192301899.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "grad_norm": 1.551578164100647, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8716484308242798, + "num_tokens": 192333344.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "grad_norm": 1.6838526725769043, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8675940632820129, + "num_tokens": 192365669.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "grad_norm": 1.4336367845535278, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8682780265808105, + "num_tokens": 192407198.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "grad_norm": 1.5033438205718994, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8509960174560547, + "num_tokens": 192444883.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "grad_norm": 1.4383623600006104, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8731772899627686, + "num_tokens": 192479785.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "grad_norm": 1.6096197366714478, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8847609758377075, + "num_tokens": 192510534.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "grad_norm": 1.4959698915481567, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8740130662918091, + "num_tokens": 192548996.0, + "step": 5050 + }, + { + "epoch": 0.6425391171606666, + "grad_norm": 1.4955170154571533, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 
0.8738241791725159, + "num_tokens": 192583909.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "grad_norm": 1.4731372594833374, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8604774475097656, + "num_tokens": 192627979.0, + "step": 5052 + }, + { + "epoch": 0.6427935377178476, + "grad_norm": 1.489142894744873, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8677947521209717, + "num_tokens": 192664062.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "grad_norm": 1.4784399271011353, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8676832318305969, + "num_tokens": 192704362.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "grad_norm": 1.5583288669586182, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.867672324180603, + "num_tokens": 192743237.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "grad_norm": 1.4938610792160034, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8617616891860962, + "num_tokens": 192782106.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "grad_norm": 1.8932232856750488, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.85429847240448, + "num_tokens": 192809787.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "grad_norm": 1.7002335786819458, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8677172660827637, + "num_tokens": 192842304.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "grad_norm": 1.5212324857711792, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8585785031318665, + "num_tokens": 192881004.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "grad_norm": 1.5214136838912964, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8771458864212036, + "num_tokens": 192914733.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "grad_norm": 
1.5168893337249756, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8660728931427002, + "num_tokens": 192954934.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "grad_norm": 1.4189156293869019, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.855150580406189, + "num_tokens": 192998790.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "grad_norm": 1.544994592666626, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8618695139884949, + "num_tokens": 193032268.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "grad_norm": 1.3025039434432983, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8826116919517517, + "num_tokens": 193075604.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "grad_norm": 1.4746630191802979, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8760738372802734, + "num_tokens": 193111482.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "grad_norm": 1.4367010593414307, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8906857371330261, + "num_tokens": 193150731.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "grad_norm": 1.4513007402420044, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8759257197380066, + "num_tokens": 193188293.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "grad_norm": 1.5315953493118286, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8691626191139221, + "num_tokens": 193224963.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "grad_norm": 1.5553967952728271, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.850436806678772, + "num_tokens": 193264437.0, + "step": 5069 + }, + { + "epoch": 0.6449561124538863, + "grad_norm": 1.554032325744629, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.862918496131897, + "num_tokens": 
193303244.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "grad_norm": 1.368124008178711, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8733510375022888, + "num_tokens": 193345083.0, + "step": 5071 + }, + { + "epoch": 0.6452105330110673, + "grad_norm": 1.4374916553497314, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8529694080352783, + "num_tokens": 193392144.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "grad_norm": 1.6270791292190552, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8717077970504761, + "num_tokens": 193425972.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "grad_norm": 1.5194780826568604, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8649088740348816, + "num_tokens": 193461439.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "grad_norm": 1.4236328601837158, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8700243234634399, + "num_tokens": 193501833.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "grad_norm": 1.6101949214935303, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8677146434783936, + "num_tokens": 193538321.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "grad_norm": 1.6595816612243652, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8428184986114502, + "num_tokens": 193573260.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "grad_norm": 1.6084798574447632, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8703486323356628, + "num_tokens": 193607876.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "grad_norm": 1.4818246364593506, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8639440536499023, + "num_tokens": 193651237.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "grad_norm": 1.540303111076355, + "learning_rate": 1e-06, + 
"loss": 0.4054, + "mean_token_accuracy": 0.8651869297027588, + "num_tokens": 193693779.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "grad_norm": 1.299311637878418, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8733454942703247, + "num_tokens": 193740431.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "grad_norm": 1.4669501781463623, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8733507394790649, + "num_tokens": 193779980.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "grad_norm": 1.55744206905365, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8560092449188232, + "num_tokens": 193816898.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "grad_norm": 1.4997296333312988, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8817011117935181, + "num_tokens": 193857775.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "grad_norm": 1.5443193912506104, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8662153482437134, + "num_tokens": 193897299.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "grad_norm": 1.3496466875076294, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8738037347793579, + "num_tokens": 193941495.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "grad_norm": 1.5268490314483643, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8709875345230103, + "num_tokens": 193978132.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "grad_norm": 1.6205257177352905, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8658380508422852, + "num_tokens": 194008940.0, + "step": 5088 + }, + { + "epoch": 0.647373107747106, + "grad_norm": 1.482239842414856, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8586116433143616, + "num_tokens": 194048924.0, + "step": 5089 + }, + { + "epoch": 
0.6475003180256965, + "grad_norm": 1.617097020149231, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8667858839035034, + "num_tokens": 194080082.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "grad_norm": 1.5185197591781616, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8585531711578369, + "num_tokens": 194118018.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "grad_norm": 1.3612478971481323, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8731709122657776, + "num_tokens": 194158472.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "grad_norm": 1.588052749633789, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8514448404312134, + "num_tokens": 194194338.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "grad_norm": 1.563993215560913, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8715187907218933, + "num_tokens": 194229929.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "grad_norm": 1.6078064441680908, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8629999756813049, + "num_tokens": 194264031.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "grad_norm": 1.4768445491790771, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8807500004768372, + "num_tokens": 194300987.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "grad_norm": 1.4868544340133667, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8623261451721191, + "num_tokens": 194340635.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "grad_norm": 1.4714967012405396, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8727558851242065, + "num_tokens": 194376855.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "grad_norm": 1.4341208934783936, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 
0.8629499077796936, + "num_tokens": 194421639.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "grad_norm": 1.3834689855575562, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8608442544937134, + "num_tokens": 194467142.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "grad_norm": 1.739395260810852, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8539354205131531, + "num_tokens": 194497273.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "grad_norm": 1.4976401329040527, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8689270615577698, + "num_tokens": 194533278.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "grad_norm": 1.4907090663909912, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8687825202941895, + "num_tokens": 194580600.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "grad_norm": 1.574240803718567, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8487206697463989, + "num_tokens": 194620386.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "grad_norm": 1.5204285383224487, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8667319416999817, + "num_tokens": 194655595.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "grad_norm": 1.4527568817138672, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8707456588745117, + "num_tokens": 194697260.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "grad_norm": 1.6534498929977417, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8553676009178162, + "num_tokens": 194730339.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "grad_norm": 1.4368410110473633, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.880789041519165, + "num_tokens": 194768304.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "grad_norm": 
1.457425594329834, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8805705308914185, + "num_tokens": 194806185.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "grad_norm": 1.4521485567092896, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8597155213356018, + "num_tokens": 194844061.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "grad_norm": 1.6158826351165771, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8535975217819214, + "num_tokens": 194877467.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "grad_norm": 1.5292843580245972, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8609163165092468, + "num_tokens": 194914130.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "grad_norm": 1.3924696445465088, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8641382455825806, + "num_tokens": 194956447.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "grad_norm": 1.596237063407898, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8590137958526611, + "num_tokens": 194992375.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "grad_norm": 1.4432592391967773, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.858054518699646, + "num_tokens": 195031856.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "grad_norm": 1.516218662261963, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8643836379051208, + "num_tokens": 195070661.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "grad_norm": 1.5721451044082642, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.861356258392334, + "num_tokens": 195103564.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "grad_norm": 1.5749566555023193, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8572059273719788, + "num_tokens": 
195140484.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "grad_norm": 1.4586576223373413, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8781113028526306, + "num_tokens": 195178723.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "grad_norm": 1.630137324333191, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8673392534255981, + "num_tokens": 195210628.0, + "step": 5120 + }, + { + "epoch": 0.6514438366620023, + "grad_norm": 1.4289460182189941, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8705623149871826, + "num_tokens": 195252062.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "grad_norm": 1.4968552589416504, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8592696189880371, + "num_tokens": 195290740.0, + "step": 5122 + }, + { + "epoch": 0.6516982572191833, + "grad_norm": 1.55780029296875, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.867768406867981, + "num_tokens": 195327564.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "grad_norm": 1.3425085544586182, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8751111030578613, + "num_tokens": 195370927.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "grad_norm": 1.5282779932022095, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8555103540420532, + "num_tokens": 195413732.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "grad_norm": 1.534504771232605, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8645671010017395, + "num_tokens": 195450185.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "grad_norm": 1.463407039642334, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8697285652160645, + "num_tokens": 195488394.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "grad_norm": 1.432092308998108, + "learning_rate": 1e-06, + 
"loss": 0.3758, + "mean_token_accuracy": 0.8684231042861938, + "num_tokens": 195527810.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "grad_norm": 1.502793788909912, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8543693423271179, + "num_tokens": 195566669.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "grad_norm": 1.5122898817062378, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8619003295898438, + "num_tokens": 195603805.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "grad_norm": 1.5412355661392212, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8737390041351318, + "num_tokens": 195638257.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "grad_norm": 1.5901248455047607, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8574645519256592, + "num_tokens": 195675782.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "grad_norm": 1.4446247816085815, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8751020431518555, + "num_tokens": 195712069.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "grad_norm": 1.5207442045211792, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8538699150085449, + "num_tokens": 195748449.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "grad_norm": 1.6378921270370483, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8661901950836182, + "num_tokens": 195782518.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "grad_norm": 1.4952092170715332, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8705205917358398, + "num_tokens": 195820843.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "grad_norm": 1.5597376823425293, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8456388711929321, + "num_tokens": 195860215.0, + "step": 5137 + }, + { + "epoch": 
0.653606411398041, + "grad_norm": 1.4323164224624634, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8819165229797363, + "num_tokens": 195903437.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "grad_norm": 1.5975557565689087, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8571538925170898, + "num_tokens": 195940787.0, + "step": 5139 + }, + { + "epoch": 0.6538608319552219, + "grad_norm": 1.6468390226364136, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8563548922538757, + "num_tokens": 195976076.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "grad_norm": 1.417887806892395, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8680591583251953, + "num_tokens": 196019260.0, + "step": 5141 + }, + { + "epoch": 0.654115252512403, + "grad_norm": 1.5307034254074097, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8635259866714478, + "num_tokens": 196056005.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "grad_norm": 1.6165560483932495, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8569615483283997, + "num_tokens": 196092345.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "grad_norm": 1.5453072786331177, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8599423170089722, + "num_tokens": 196130340.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "grad_norm": 1.6218377351760864, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8574029207229614, + "num_tokens": 196165663.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "grad_norm": 1.4647496938705444, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8727557063102722, + "num_tokens": 196206407.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "grad_norm": 1.5325385332107544, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 
0.8604742288589478, + "num_tokens": 196246567.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "grad_norm": 1.5875937938690186, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8514093160629272, + "num_tokens": 196282017.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "grad_norm": 1.4756622314453125, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8523657917976379, + "num_tokens": 196319846.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "grad_norm": 1.5886787176132202, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8582369685173035, + "num_tokens": 196361256.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "grad_norm": 1.4065793752670288, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8761089444160461, + "num_tokens": 196399412.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "grad_norm": 1.545306921005249, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8670928478240967, + "num_tokens": 196435930.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "grad_norm": 1.664613127708435, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8625558614730835, + "num_tokens": 196471401.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "grad_norm": 1.3447611331939697, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8768054246902466, + "num_tokens": 196514589.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "grad_norm": 1.714081048965454, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8698459267616272, + "num_tokens": 196545790.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "grad_norm": 1.565413475036621, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8614779114723206, + "num_tokens": 196580566.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "grad_norm": 
1.4985542297363281, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8646318912506104, + "num_tokens": 196622604.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "grad_norm": 1.764341115951538, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8819619417190552, + "num_tokens": 196651678.0, + "step": 5158 + }, + { + "epoch": 0.6562778272484416, + "grad_norm": 1.7251836061477661, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8486173152923584, + "num_tokens": 196684977.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "grad_norm": 1.6448516845703125, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8730090260505676, + "num_tokens": 196723772.0, + "step": 5160 + }, + { + "epoch": 0.6565322478056227, + "grad_norm": 1.769621729850769, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8586548566818237, + "num_tokens": 196754256.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "grad_norm": 1.5405035018920898, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8638883829116821, + "num_tokens": 196791744.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "grad_norm": 1.5189250707626343, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8611509203910828, + "num_tokens": 196827771.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "grad_norm": 1.6107423305511475, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8797423243522644, + "num_tokens": 196858215.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "grad_norm": 1.5478020906448364, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8702174425125122, + "num_tokens": 196893947.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "grad_norm": 1.6387205123901367, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8655604124069214, + "num_tokens": 
196923359.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "grad_norm": 1.4486290216445923, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8647961616516113, + "num_tokens": 196968585.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "grad_norm": 1.436408281326294, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8816872239112854, + "num_tokens": 197005672.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "grad_norm": 1.6000295877456665, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8570716381072998, + "num_tokens": 197042264.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "grad_norm": 1.438041090965271, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8644554018974304, + "num_tokens": 197085723.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "grad_norm": 1.4444750547409058, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.864387571811676, + "num_tokens": 197127319.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "grad_norm": 1.6137079000473022, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8671839237213135, + "num_tokens": 197162426.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "grad_norm": 1.531167984008789, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8796122074127197, + "num_tokens": 197202875.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "grad_norm": 1.6690727472305298, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.837117075920105, + "num_tokens": 197240967.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "grad_norm": 1.493916392326355, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8615491390228271, + "num_tokens": 197281351.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "grad_norm": 1.406472086906433, + "learning_rate": 1e-06, + 
"loss": 0.3913, + "mean_token_accuracy": 0.8650329113006592, + "num_tokens": 197319377.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "grad_norm": 1.4272596836090088, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8814057111740112, + "num_tokens": 197355860.0, + "step": 5177 + }, + { + "epoch": 0.6586948225416613, + "grad_norm": 1.438151478767395, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8707399368286133, + "num_tokens": 197393053.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "grad_norm": 1.4725027084350586, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8569310903549194, + "num_tokens": 197432244.0, + "step": 5179 + }, + { + "epoch": 0.6589492430988424, + "grad_norm": 1.5998295545578003, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8588718175888062, + "num_tokens": 197466657.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "grad_norm": 1.584490418434143, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8601922988891602, + "num_tokens": 197501443.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "grad_norm": 1.4490482807159424, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8669633865356445, + "num_tokens": 197541589.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "grad_norm": 1.4978939294815063, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8582031726837158, + "num_tokens": 197582790.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "grad_norm": 1.4608343839645386, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8677207827568054, + "num_tokens": 197623270.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "grad_norm": 1.5497345924377441, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8577384948730469, + "num_tokens": 197661865.0, + "step": 5185 + }, + { + "epoch": 
0.6597125047703855, + "grad_norm": 1.6031725406646729, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8399786353111267, + "num_tokens": 197702802.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "grad_norm": 1.382659673690796, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8752892017364502, + "num_tokens": 197745809.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "grad_norm": 1.5313079357147217, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8778382539749146, + "num_tokens": 197778862.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "grad_norm": 1.6854743957519531, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.85982346534729, + "num_tokens": 197813848.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "grad_norm": 1.4153223037719727, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8721209764480591, + "num_tokens": 197856815.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "grad_norm": 1.6479713916778564, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8574368953704834, + "num_tokens": 197888826.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "grad_norm": 1.4434356689453125, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8766794800758362, + "num_tokens": 197927034.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "grad_norm": 1.4836468696594238, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8612064719200134, + "num_tokens": 197966494.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "grad_norm": 1.6593961715698242, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8666433095932007, + "num_tokens": 198000027.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "grad_norm": 1.5077489614486694, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 
0.8635092377662659, + "num_tokens": 198037822.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "grad_norm": 1.4820175170898438, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8547725081443787, + "num_tokens": 198078370.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, + "grad_norm": 1.5165680646896362, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8521068692207336, + "num_tokens": 198119150.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "grad_norm": 1.526984691619873, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8602321147918701, + "num_tokens": 198155951.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, + "grad_norm": 1.4661264419555664, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8649466633796692, + "num_tokens": 198196722.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "grad_norm": 1.729755163192749, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8424767255783081, + "num_tokens": 198234264.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "grad_norm": 1.5522100925445557, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8623875379562378, + "num_tokens": 198266747.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "grad_norm": 1.4504693746566772, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8681368827819824, + "num_tokens": 198304205.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "grad_norm": 1.382041096687317, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8581318855285645, + "num_tokens": 198349603.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "grad_norm": 1.4275399446487427, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.862449049949646, + "num_tokens": 198390905.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "grad_norm": 
1.5613877773284912, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8550481200218201, + "num_tokens": 198432157.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "grad_norm": 1.6708499193191528, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8527325987815857, + "num_tokens": 198469669.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "grad_norm": 1.5705077648162842, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8661817312240601, + "num_tokens": 198502302.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "grad_norm": 1.509414792060852, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8646723031997681, + "num_tokens": 198544375.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "grad_norm": 1.5679547786712646, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8525829315185547, + "num_tokens": 198584142.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "grad_norm": 1.4533579349517822, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.865199863910675, + "num_tokens": 198625745.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "grad_norm": 1.5991626977920532, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8870205879211426, + "num_tokens": 198662058.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "grad_norm": 1.4038161039352417, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8794885873794556, + "num_tokens": 198703829.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "grad_norm": 1.4073951244354248, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8656934499740601, + "num_tokens": 198747748.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "grad_norm": 1.5290162563323975, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8573049902915955, + "num_tokens": 
198788800.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "grad_norm": 1.4697530269622803, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8708937168121338, + "num_tokens": 198825088.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "grad_norm": 1.4301921129226685, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8715139031410217, + "num_tokens": 198866764.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "grad_norm": 1.506314992904663, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8669447898864746, + "num_tokens": 198904120.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "grad_norm": 1.5114214420318604, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8639789819717407, + "num_tokens": 198941426.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "grad_norm": 1.5880507230758667, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8694498538970947, + "num_tokens": 198975997.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "grad_norm": 1.3897923231124878, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8663945198059082, + "num_tokens": 199016400.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "grad_norm": 1.573335886001587, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8546813726425171, + "num_tokens": 199056499.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "grad_norm": 1.4359222650527954, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8698620796203613, + "num_tokens": 199096475.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "grad_norm": 1.467164397239685, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8472658395767212, + "num_tokens": 199139476.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "grad_norm": 1.5645720958709717, + "learning_rate": 1e-06, + 
"loss": 0.4189, + "mean_token_accuracy": 0.8568727970123291, + "num_tokens": 199177108.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "grad_norm": 1.3801947832107544, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8762609362602234, + "num_tokens": 199219538.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "grad_norm": 1.530450463294983, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8736960291862488, + "num_tokens": 199254649.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "grad_norm": 1.422382116317749, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8654555082321167, + "num_tokens": 199293206.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "grad_norm": 1.466060996055603, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8759716749191284, + "num_tokens": 199328430.0, + "step": 5228 + }, + { + "epoch": 0.6651825467497774, + "grad_norm": 1.4476886987686157, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8727289438247681, + "num_tokens": 199364051.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "grad_norm": 1.602355718612671, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.869915246963501, + "num_tokens": 199397389.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "grad_norm": 1.5420595407485962, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8710007667541504, + "num_tokens": 199432867.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "grad_norm": 1.5259300470352173, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8631237745285034, + "num_tokens": 199470389.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "grad_norm": 1.505579948425293, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.880986750125885, + "num_tokens": 199505843.0, + "step": 5233 + }, + { + "epoch": 
0.6658185981427299, + "grad_norm": 1.6108049154281616, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8757973909378052, + "num_tokens": 199538672.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "grad_norm": 1.37325119972229, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8742284774780273, + "num_tokens": 199581483.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "grad_norm": 1.4384206533432007, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8569803237915039, + "num_tokens": 199627542.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "grad_norm": 1.5394667387008667, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8716347813606262, + "num_tokens": 199661933.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "grad_norm": 1.5620684623718262, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8484697341918945, + "num_tokens": 199700872.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "grad_norm": 1.4584451913833618, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8688804507255554, + "num_tokens": 199738949.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "grad_norm": 1.6272010803222656, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8453093767166138, + "num_tokens": 199776556.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "grad_norm": 1.53230619430542, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8417475819587708, + "num_tokens": 199817178.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "grad_norm": 1.568615436553955, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8647710084915161, + "num_tokens": 199852874.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "grad_norm": 1.6136473417282104, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 
0.8518775105476379, + "num_tokens": 199888738.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "grad_norm": 1.5119104385375977, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8689097762107849, + "num_tokens": 199930193.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "grad_norm": 1.6061674356460571, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8683785200119019, + "num_tokens": 199962504.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "grad_norm": 1.3043428659439087, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8854591846466064, + "num_tokens": 200004669.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "grad_norm": 1.5312086343765259, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8743356466293335, + "num_tokens": 200038527.0, + "step": 5247 + }, + { + "epoch": 0.6675995420429971, + "grad_norm": 1.5117738246917725, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8632772564888, + "num_tokens": 200073746.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "grad_norm": 1.4111034870147705, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8802810907363892, + "num_tokens": 200109582.0, + "step": 5249 + }, + { + "epoch": 0.667853962600178, + "grad_norm": 1.5140039920806885, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8698911070823669, + "num_tokens": 200145376.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "grad_norm": 1.4501954317092896, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8564797043800354, + "num_tokens": 200183903.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "grad_norm": 1.482897400856018, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8625644445419312, + "num_tokens": 200223713.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "grad_norm": 
1.4969849586486816, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8494937419891357, + "num_tokens": 200265187.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "grad_norm": 1.5064409971237183, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8648062348365784, + "num_tokens": 200301150.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "grad_norm": 1.5447585582733154, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8659176826477051, + "num_tokens": 200336147.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "grad_norm": 1.5137149095535278, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8662457466125488, + "num_tokens": 200371104.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "grad_norm": 1.4362893104553223, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8660696744918823, + "num_tokens": 200414974.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "grad_norm": 1.441317081451416, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8716530799865723, + "num_tokens": 200457167.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "grad_norm": 1.681007742881775, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8732098340988159, + "num_tokens": 200488847.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "grad_norm": 1.6692723035812378, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.861700713634491, + "num_tokens": 200522254.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "grad_norm": 1.4099847078323364, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8791592121124268, + "num_tokens": 200559957.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "grad_norm": 1.6023143529891968, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8401012420654297, + "num_tokens": 
200598027.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "grad_norm": 1.4401061534881592, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8778378963470459, + "num_tokens": 200639280.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "grad_norm": 1.4276409149169922, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8714755177497864, + "num_tokens": 200679988.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "grad_norm": 1.5283570289611816, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8578571081161499, + "num_tokens": 200716483.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "grad_norm": 1.5373636484146118, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8498951196670532, + "num_tokens": 200756609.0, + "step": 5266 + }, + { + "epoch": 0.6700165373362168, + "grad_norm": 1.5102132558822632, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8638489842414856, + "num_tokens": 200795175.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "grad_norm": 1.4548754692077637, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8493689894676208, + "num_tokens": 200837109.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, + "grad_norm": 1.4394060373306274, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8669976592063904, + "num_tokens": 200874285.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "grad_norm": 1.5401172637939453, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8575021624565125, + "num_tokens": 200911572.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "grad_norm": 1.3628877401351929, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8712804317474365, + "num_tokens": 200953913.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "grad_norm": 1.4105772972106934, + "learning_rate": 1e-06, 
+ "loss": 0.4187, + "mean_token_accuracy": 0.8600159287452698, + "num_tokens": 200997004.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "grad_norm": 1.441534399986267, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8690063953399658, + "num_tokens": 201036786.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "grad_norm": 1.4911867380142212, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8702115416526794, + "num_tokens": 201072440.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "grad_norm": 1.5033131837844849, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8622907400131226, + "num_tokens": 201110460.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "grad_norm": 1.5203938484191895, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8570653796195984, + "num_tokens": 201147849.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "grad_norm": 1.5154731273651123, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8695908784866333, + "num_tokens": 201183524.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "grad_norm": 1.5056159496307373, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8605990409851074, + "num_tokens": 201224311.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "grad_norm": 1.6084307432174683, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8541494607925415, + "num_tokens": 201259641.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "grad_norm": 1.509236454963684, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8461349010467529, + "num_tokens": 201302187.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "grad_norm": 1.4593124389648438, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8779627084732056, + "num_tokens": 201339857.0, + "step": 5281 + }, + { + "epoch": 
0.6719246915150744, + "grad_norm": 1.4577083587646484, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8558807373046875, + "num_tokens": 201380694.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "grad_norm": 1.5374701023101807, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.861794650554657, + "num_tokens": 201419128.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "grad_norm": 1.5513955354690552, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8578476905822754, + "num_tokens": 201456723.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "grad_norm": 1.4777281284332275, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8709163665771484, + "num_tokens": 201497285.0, + "step": 5285 + }, + { + "epoch": 0.6724335326294365, + "grad_norm": 1.4493687152862549, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.867646336555481, + "num_tokens": 201539158.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "grad_norm": 1.3898590803146362, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8775371313095093, + "num_tokens": 201580250.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "grad_norm": 1.5602099895477295, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8683857917785645, + "num_tokens": 201621060.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "grad_norm": 1.4800300598144531, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8721027374267578, + "num_tokens": 201659229.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "grad_norm": 1.461835265159607, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.858670711517334, + "num_tokens": 201702507.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "grad_norm": 1.5648226737976074, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 
0.8632200956344604, + "num_tokens": 201738098.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "grad_norm": 1.6690016984939575, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8649111986160278, + "num_tokens": 201776864.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "grad_norm": 1.4277669191360474, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8581686615943909, + "num_tokens": 201819891.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "grad_norm": 1.437527060508728, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8775886297225952, + "num_tokens": 201858981.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "grad_norm": 1.5760587453842163, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8606147766113281, + "num_tokens": 201897305.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "grad_norm": 1.4829490184783936, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8562184572219849, + "num_tokens": 201938904.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "grad_norm": 1.6704531908035278, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8662582635879517, + "num_tokens": 201969527.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "grad_norm": 1.6177058219909668, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8606841564178467, + "num_tokens": 202003820.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "grad_norm": 1.5438258647918701, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8515568375587463, + "num_tokens": 202049151.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "grad_norm": 1.4963228702545166, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8527031540870667, + "num_tokens": 202093848.0, + "step": 5300 + }, + { + "epoch": 0.6743416868082941, + "grad_norm": 
1.5845555067062378, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8595994710922241, + "num_tokens": 202129691.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "grad_norm": 1.433655858039856, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8674004077911377, + "num_tokens": 202175431.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "grad_norm": 1.4571313858032227, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8639014959335327, + "num_tokens": 202215308.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "grad_norm": 1.5893745422363281, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8642551898956299, + "num_tokens": 202250235.0, + "step": 5304 + }, + { + "epoch": 0.6748505279226561, + "grad_norm": 1.45638906955719, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8463371992111206, + "num_tokens": 202288535.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "grad_norm": 1.5450899600982666, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8580875992774963, + "num_tokens": 202324329.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "grad_norm": 1.4726091623306274, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8653453588485718, + "num_tokens": 202361934.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "grad_norm": 1.4210189580917358, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8648426532745361, + "num_tokens": 202404400.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "grad_norm": 1.5151907205581665, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8543963432312012, + "num_tokens": 202445503.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "grad_norm": 1.516703724861145, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8713939189910889, + "num_tokens": 
202485204.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "grad_norm": 1.4530001878738403, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8639776110649109, + "num_tokens": 202529320.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "grad_norm": 1.4325731992721558, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.863597571849823, + "num_tokens": 202569917.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "grad_norm": 1.484562635421753, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8821953535079956, + "num_tokens": 202605222.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "grad_norm": 1.4662909507751465, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8542184233665466, + "num_tokens": 202647122.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "grad_norm": 1.6126976013183594, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8574180603027344, + "num_tokens": 202683538.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "grad_norm": 1.5035547018051147, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8738957643508911, + "num_tokens": 202718490.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "grad_norm": 1.5154931545257568, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8466753363609314, + "num_tokens": 202759446.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "grad_norm": 1.5126163959503174, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8687567114830017, + "num_tokens": 202799758.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "grad_norm": 1.4292608499526978, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8556058406829834, + "num_tokens": 202842002.0, + "step": 5319 + }, + { + "epoch": 0.6767586821015138, + "grad_norm": 1.6554267406463623, + "learning_rate": 1e-06, + 
"loss": 0.4268, + "mean_token_accuracy": 0.8551772236824036, + "num_tokens": 202874486.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "grad_norm": 1.577701449394226, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8657423257827759, + "num_tokens": 202905299.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "grad_norm": 1.5001845359802246, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8822311758995056, + "num_tokens": 202940053.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "grad_norm": 1.3224029541015625, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8825658559799194, + "num_tokens": 202982430.0, + "step": 5323 + }, + { + "epoch": 0.6772675232158758, + "grad_norm": 1.5035462379455566, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.86571204662323, + "num_tokens": 203021706.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "grad_norm": 1.6203874349594116, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8602579832077026, + "num_tokens": 203060911.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "grad_norm": 1.5237528085708618, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8740374445915222, + "num_tokens": 203099604.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "grad_norm": 1.4162882566452026, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8586587905883789, + "num_tokens": 203147190.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "grad_norm": 1.5808137655258179, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.872695803642273, + "num_tokens": 203181716.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "grad_norm": 1.5401819944381714, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8564497828483582, + "num_tokens": 203223773.0, + "step": 5329 + }, + { + "epoch": 
0.6780307848874189, + "grad_norm": 1.5921542644500732, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8581834435462952, + "num_tokens": 203260331.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "grad_norm": 1.5420140027999878, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8628774881362915, + "num_tokens": 203300150.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "grad_norm": 1.421051263809204, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8703756332397461, + "num_tokens": 203341697.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "grad_norm": 1.5122947692871094, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8514724969863892, + "num_tokens": 203383015.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "grad_norm": 1.5690606832504272, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8476911783218384, + "num_tokens": 203422633.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "grad_norm": 1.5565508604049683, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8741779327392578, + "num_tokens": 203458744.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "grad_norm": 1.757308006286621, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8567744493484497, + "num_tokens": 203492856.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "grad_norm": 1.6196399927139282, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8651532530784607, + "num_tokens": 203525524.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "grad_norm": 1.4312312602996826, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8626359701156616, + "num_tokens": 203563980.0, + "step": 5338 + }, + { + "epoch": 0.6791756773947335, + "grad_norm": 1.4081194400787354, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 
0.8726907968521118, + "num_tokens": 203604729.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "grad_norm": 1.4380662441253662, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8740487098693848, + "num_tokens": 203645678.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "grad_norm": 1.4079207181930542, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8681236505508423, + "num_tokens": 203689783.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "grad_norm": 1.6044667959213257, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8661304712295532, + "num_tokens": 203721609.0, + "step": 5342 + }, + { + "epoch": 0.6796845185090955, + "grad_norm": 1.745198130607605, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8584885001182556, + "num_tokens": 203750605.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "grad_norm": 1.3710079193115234, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8791102766990662, + "num_tokens": 203795751.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "grad_norm": 1.4611424207687378, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.868110179901123, + "num_tokens": 203837722.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "grad_norm": 1.5585752725601196, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8734325766563416, + "num_tokens": 203872722.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "grad_norm": 1.5308877229690552, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.861862301826477, + "num_tokens": 203910313.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "grad_norm": 1.556591510772705, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8623949289321899, + "num_tokens": 203949253.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "grad_norm": 
1.4738506078720093, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8615307211875916, + "num_tokens": 203989647.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "grad_norm": 1.5557066202163696, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8618118762969971, + "num_tokens": 204027454.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "grad_norm": 1.4933888912200928, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8680223226547241, + "num_tokens": 204068819.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "grad_norm": 1.663078784942627, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.863976001739502, + "num_tokens": 204101355.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "grad_norm": 1.468390941619873, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8557882308959961, + "num_tokens": 204142413.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "grad_norm": 1.5384458303451538, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8593393564224243, + "num_tokens": 204179311.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "grad_norm": 1.495833158493042, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8505154848098755, + "num_tokens": 204219682.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "grad_norm": 1.4870911836624146, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8684689998626709, + "num_tokens": 204257457.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "grad_norm": 1.6458314657211304, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8674381971359253, + "num_tokens": 204290737.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "grad_norm": 1.609700083732605, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.862474799156189, + "num_tokens": 
204324206.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "grad_norm": 1.4712798595428467, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8656494617462158, + "num_tokens": 204362036.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "grad_norm": 1.6147196292877197, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8493884801864624, + "num_tokens": 204399629.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "grad_norm": 1.4908256530761719, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8634042739868164, + "num_tokens": 204439292.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "grad_norm": 1.507738709449768, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8855910897254944, + "num_tokens": 204476590.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "grad_norm": 1.50091552734375, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.86330646276474, + "num_tokens": 204516017.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "grad_norm": 1.5253422260284424, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8649272322654724, + "num_tokens": 204553753.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "grad_norm": 1.457621693611145, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8570241928100586, + "num_tokens": 204598816.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "grad_norm": 1.52491295337677, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8631633520126343, + "num_tokens": 204636476.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "grad_norm": 1.4036893844604492, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8696435689926147, + "num_tokens": 204676532.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "grad_norm": 1.487206220626831, + "learning_rate": 1e-06, + 
"loss": 0.425, + "mean_token_accuracy": 0.8554425835609436, + "num_tokens": 204716628.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "grad_norm": 1.4280726909637451, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8706405758857727, + "num_tokens": 204755090.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "grad_norm": 1.5858455896377563, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8715243935585022, + "num_tokens": 204788745.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "grad_norm": 1.614071011543274, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8719368577003479, + "num_tokens": 204823338.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "grad_norm": 1.3787893056869507, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8678787350654602, + "num_tokens": 204863024.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "grad_norm": 1.440262794494629, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8661491274833679, + "num_tokens": 204905414.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "grad_norm": 1.552204966545105, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8551468849182129, + "num_tokens": 204946960.0, + "step": 5374 + }, + { + "epoch": 0.6837552474239919, + "grad_norm": 1.456965684890747, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8700160384178162, + "num_tokens": 204986615.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "grad_norm": 1.6707727909088135, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8596247434616089, + "num_tokens": 205019852.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "grad_norm": 1.5353751182556152, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8452009558677673, + "num_tokens": 205060861.0, + "step": 5377 + }, + { + "epoch": 
0.6841368782597634, + "grad_norm": 1.5397378206253052, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8661890029907227, + "num_tokens": 205099046.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "grad_norm": 1.522283673286438, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8560632467269897, + "num_tokens": 205141709.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "grad_norm": 1.5032790899276733, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8679037094116211, + "num_tokens": 205179025.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "grad_norm": 1.489540696144104, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8613013029098511, + "num_tokens": 205219524.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "grad_norm": 1.509850263595581, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8720753192901611, + "num_tokens": 205254717.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "grad_norm": 1.3990225791931152, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8538244962692261, + "num_tokens": 205301739.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "grad_norm": 1.4104329347610474, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8795690536499023, + "num_tokens": 205339815.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "grad_norm": 1.5114891529083252, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.854468822479248, + "num_tokens": 205378594.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "grad_norm": 1.4651954174041748, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8584818243980408, + "num_tokens": 205419922.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "grad_norm": 1.373417615890503, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 
0.8869081735610962, + "num_tokens": 205459738.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "grad_norm": 1.4650492668151855, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8875106573104858, + "num_tokens": 205497358.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "grad_norm": 1.520789384841919, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8696074485778809, + "num_tokens": 205534079.0, + "step": 5389 + }, + { + "epoch": 0.6856634016028496, + "grad_norm": 1.5570950508117676, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8661010265350342, + "num_tokens": 205571925.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "grad_norm": 1.3980622291564941, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8607640862464905, + "num_tokens": 205615363.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "grad_norm": 1.5176540613174438, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8681439757347107, + "num_tokens": 205652477.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "grad_norm": 1.5107804536819458, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8878412246704102, + "num_tokens": 205688276.0, + "step": 5393 + }, + { + "epoch": 0.6861722427172116, + "grad_norm": 1.5183653831481934, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8791357278823853, + "num_tokens": 205723432.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "grad_norm": 1.5361413955688477, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8780617713928223, + "num_tokens": 205758051.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "grad_norm": 1.4613569974899292, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8765528202056885, + "num_tokens": 205795643.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "grad_norm": 
1.643776535987854, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8423247337341309, + "num_tokens": 205833087.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "grad_norm": 1.5552011728286743, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8514735102653503, + "num_tokens": 205871481.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "grad_norm": 1.6762046813964844, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8350649476051331, + "num_tokens": 205908491.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "grad_norm": 1.39064621925354, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8726262450218201, + "num_tokens": 205946386.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "grad_norm": 1.479469895362854, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8535094261169434, + "num_tokens": 205986856.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "grad_norm": 1.483751654624939, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8672923445701599, + "num_tokens": 206025708.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "grad_norm": 1.48283052444458, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8694735169410706, + "num_tokens": 206062588.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "grad_norm": 1.4533582925796509, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8711032867431641, + "num_tokens": 206103393.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "grad_norm": 1.418756127357483, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8740382194519043, + "num_tokens": 206146923.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "grad_norm": 1.619879961013794, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8752419948577881, + "num_tokens": 
206179093.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "grad_norm": 1.6470794677734375, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.856543779373169, + "num_tokens": 206213198.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "grad_norm": 1.6126152276992798, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8479417562484741, + "num_tokens": 206250273.0, + "step": 5408 + }, + { + "epoch": 0.6880803968960693, + "grad_norm": 1.5652016401290894, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8639616370201111, + "num_tokens": 206286711.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "grad_norm": 1.420986294746399, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8798218369483948, + "num_tokens": 206323572.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "grad_norm": 1.56619393825531, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8778349757194519, + "num_tokens": 206355974.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "grad_norm": 1.6513880491256714, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8526010513305664, + "num_tokens": 206390570.0, + "step": 5412 + }, + { + "epoch": 0.6885892380104313, + "grad_norm": 1.4223947525024414, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8842177391052246, + "num_tokens": 206425183.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "grad_norm": 1.4661670923233032, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8751645088195801, + "num_tokens": 206461954.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "grad_norm": 1.4667774438858032, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8685207962989807, + "num_tokens": 206500917.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "grad_norm": 1.6793437004089355, + "learning_rate": 1e-06, + 
"loss": 0.471, + "mean_token_accuracy": 0.8424440622329712, + "num_tokens": 206535583.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "grad_norm": 1.5038959980010986, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8675838708877563, + "num_tokens": 206572587.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "grad_norm": 1.5796207189559937, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8516806364059448, + "num_tokens": 206609661.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "grad_norm": 1.631524682044983, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.847395122051239, + "num_tokens": 206646812.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "grad_norm": 1.4180048704147339, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8761764168739319, + "num_tokens": 206685060.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "grad_norm": 1.6311898231506348, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8663625121116638, + "num_tokens": 206721626.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "grad_norm": 1.4449162483215332, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8543882966041565, + "num_tokens": 206767972.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "grad_norm": 1.5169401168823242, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8723268508911133, + "num_tokens": 206807150.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "grad_norm": 1.4739389419555664, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8680254817008972, + "num_tokens": 206851097.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "grad_norm": 1.638503909111023, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8757227659225464, + "num_tokens": 206885996.0, + "step": 5425 + }, + { + "epoch": 
0.6902429716321079, + "grad_norm": 1.5513591766357422, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8763479590415955, + "num_tokens": 206920922.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "grad_norm": 1.383190631866455, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8714706301689148, + "num_tokens": 206961666.0, + "step": 5427 + }, + { + "epoch": 0.6904973921892888, + "grad_norm": 1.5684202909469604, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8694941997528076, + "num_tokens": 207001979.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "grad_norm": 1.5645136833190918, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8688685297966003, + "num_tokens": 207039810.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "grad_norm": 1.5875319242477417, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8601070642471313, + "num_tokens": 207075966.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "grad_norm": 1.3793871402740479, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8720558881759644, + "num_tokens": 207118423.0, + "step": 5431 + }, + { + "epoch": 0.691006233303651, + "grad_norm": 1.4045281410217285, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.881460964679718, + "num_tokens": 207154755.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "grad_norm": 1.4912923574447632, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8605774641036987, + "num_tokens": 207193965.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "grad_norm": 1.6166576147079468, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8649072647094727, + "num_tokens": 207234527.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "grad_norm": 1.545493245124817, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 
0.8516051173210144, + "num_tokens": 207273998.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "grad_norm": 1.4481345415115356, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.859422504901886, + "num_tokens": 207318783.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "grad_norm": 1.399611473083496, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8747897148132324, + "num_tokens": 207358403.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "grad_norm": 1.4412484169006348, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8673641681671143, + "num_tokens": 207397631.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "grad_norm": 1.4160548448562622, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8690144419670105, + "num_tokens": 207441651.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "grad_norm": 1.360671877861023, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8793801069259644, + "num_tokens": 207483235.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "grad_norm": 1.559050440788269, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8771986365318298, + "num_tokens": 207521871.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "grad_norm": 1.4107048511505127, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8726484179496765, + "num_tokens": 207561717.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "grad_norm": 1.3513656854629517, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8746588826179504, + "num_tokens": 207603684.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "grad_norm": 1.5925325155258179, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8686893582344055, + "num_tokens": 207638250.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "grad_norm": 
1.3926455974578857, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8865231275558472, + "num_tokens": 207675289.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "grad_norm": 1.51554274559021, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8683457374572754, + "num_tokens": 207710926.0, + "step": 5446 + }, + { + "epoch": 0.6929143874825086, + "grad_norm": 1.5080822706222534, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8691949248313904, + "num_tokens": 207747965.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "grad_norm": 1.469429850578308, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8725495338439941, + "num_tokens": 207785059.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "grad_norm": 1.5366389751434326, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8552844524383545, + "num_tokens": 207823511.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "grad_norm": 1.5181254148483276, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8662838339805603, + "num_tokens": 207860761.0, + "step": 5450 + }, + { + "epoch": 0.6934232285968707, + "grad_norm": 1.565171241760254, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.866881251335144, + "num_tokens": 207895901.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "grad_norm": 1.4856619834899902, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8534079194068909, + "num_tokens": 207940428.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "grad_norm": 1.3711172342300415, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.858773946762085, + "num_tokens": 207983137.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "grad_norm": 1.4829312562942505, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.875394880771637, + "num_tokens": 208018425.0, 
+ "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "grad_norm": 1.3730194568634033, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8608905076980591, + "num_tokens": 208061474.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "grad_norm": 1.4449174404144287, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8644816279411316, + "num_tokens": 208097963.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "grad_norm": 1.7265760898590088, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8637422323226929, + "num_tokens": 208130920.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "grad_norm": 1.4528337717056274, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8608546257019043, + "num_tokens": 208170071.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "grad_norm": 1.4825835227966309, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8580976724624634, + "num_tokens": 208208584.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "grad_norm": 1.846207618713379, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8648384809494019, + "num_tokens": 208234037.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "grad_norm": 1.4752678871154785, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8590075969696045, + "num_tokens": 208273732.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "grad_norm": 1.4066411256790161, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8834261298179626, + "num_tokens": 208314661.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "grad_norm": 1.365645408630371, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8531239032745361, + "num_tokens": 208360171.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "grad_norm": 1.5877513885498047, + "learning_rate": 1e-06, + "loss": 
0.4108, + "mean_token_accuracy": 0.8610108494758606, + "num_tokens": 208395665.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "grad_norm": 1.6015527248382568, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8480098247528076, + "num_tokens": 208436285.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "grad_norm": 1.4944769144058228, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8726524710655212, + "num_tokens": 208471061.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "grad_norm": 1.3455970287322998, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8683231472969055, + "num_tokens": 208515527.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "grad_norm": 1.566039800643921, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8541551828384399, + "num_tokens": 208554171.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "grad_norm": 1.6905723810195923, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8560489416122437, + "num_tokens": 208585986.0, + "step": 5469 + }, + { + "epoch": 0.6958402238900904, + "grad_norm": 1.4780975580215454, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8780686855316162, + "num_tokens": 208623066.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "grad_norm": 1.3722805976867676, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8749477863311768, + "num_tokens": 208666229.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "grad_norm": 1.44137442111969, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8779184222221375, + "num_tokens": 208705113.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "grad_norm": 1.479373812675476, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8711034655570984, + "num_tokens": 208744141.0, + "step": 5473 + }, + { + "epoch": 
0.6963490650044524, + "grad_norm": 1.6115912199020386, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8737467527389526, + "num_tokens": 208781183.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "grad_norm": 1.4644228219985962, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.868698000907898, + "num_tokens": 208820022.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "grad_norm": 1.7273932695388794, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8537771105766296, + "num_tokens": 208852682.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "grad_norm": 1.664923906326294, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8725254535675049, + "num_tokens": 208883593.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "grad_norm": 1.4905096292495728, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8512101173400879, + "num_tokens": 208925991.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "grad_norm": 1.5337578058242798, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8558101654052734, + "num_tokens": 208963007.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "grad_norm": 1.4941251277923584, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.866468071937561, + "num_tokens": 208999512.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "grad_norm": 1.5775467157363892, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8642932176589966, + "num_tokens": 209034205.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "grad_norm": 1.4854077100753784, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8655415773391724, + "num_tokens": 209069875.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "grad_norm": 1.5653823614120483, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 
0.8627377152442932, + "num_tokens": 209109410.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "grad_norm": 1.5348681211471558, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8698534965515137, + "num_tokens": 209143219.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "grad_norm": 1.5906171798706055, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8730803728103638, + "num_tokens": 209176529.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "grad_norm": 1.4566220045089722, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.872118353843689, + "num_tokens": 209217090.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "grad_norm": 1.4429199695587158, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8754346370697021, + "num_tokens": 209256511.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "grad_norm": 1.4595648050308228, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8558238744735718, + "num_tokens": 209296438.0, + "step": 5488 + }, + { + "epoch": 0.69825721918331, + "grad_norm": 1.606932282447815, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8744750618934631, + "num_tokens": 209331570.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "grad_norm": 1.677240014076233, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8670125603675842, + "num_tokens": 209365295.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "grad_norm": 1.485365629196167, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8827834725379944, + "num_tokens": 209400401.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "grad_norm": 1.492143988609314, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8605282306671143, + "num_tokens": 209437086.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "grad_norm": 1.5799931287765503, 
+ "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8589009642601013, + "num_tokens": 209476186.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "grad_norm": 1.5311607122421265, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8680793046951294, + "num_tokens": 209511390.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "grad_norm": 1.3908004760742188, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.870662271976471, + "num_tokens": 209554814.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "grad_norm": 1.470357894897461, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.862548828125, + "num_tokens": 209599144.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "grad_norm": 1.7365400791168213, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.836238443851471, + "num_tokens": 209633816.0, + "step": 5497 + }, + { + "epoch": 0.6994021116906246, + "grad_norm": 1.5133024454116821, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8685299158096313, + "num_tokens": 209670239.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "grad_norm": 1.4525916576385498, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8597916960716248, + "num_tokens": 209710613.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "grad_norm": 1.54697585105896, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8674265146255493, + "num_tokens": 209745839.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "grad_norm": 1.585578441619873, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8618154525756836, + "num_tokens": 209781845.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "grad_norm": 1.5590258836746216, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8794574737548828, + "num_tokens": 209817792.0, + "step": 5502 + }, 
+ { + "epoch": 0.7000381630835771, + "grad_norm": 1.4698622226715088, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.878446102142334, + "num_tokens": 209853178.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "grad_norm": 1.4749388694763184, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8655942678451538, + "num_tokens": 209891684.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "grad_norm": 1.487324595451355, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8529152274131775, + "num_tokens": 209930724.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "grad_norm": 1.5975537300109863, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8717149496078491, + "num_tokens": 209967906.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "grad_norm": 1.6436939239501953, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8794394731521606, + "num_tokens": 210002347.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "grad_norm": 1.442576289176941, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.867595374584198, + "num_tokens": 210042188.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "grad_norm": 1.4362528324127197, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8579244613647461, + "num_tokens": 210082589.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "grad_norm": 1.4915212392807007, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8664426803588867, + "num_tokens": 210119735.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "grad_norm": 1.5021365880966187, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.870985209941864, + "num_tokens": 210155825.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "grad_norm": 1.6441850662231445, + "learning_rate": 1e-06, + "loss": 0.3652, + 
"mean_token_accuracy": 0.8720986843109131, + "num_tokens": 210188608.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "grad_norm": 1.5169754028320312, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8603197336196899, + "num_tokens": 210225230.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "grad_norm": 1.5264558792114258, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8693878054618835, + "num_tokens": 210265263.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "grad_norm": 1.449454426765442, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8662910461425781, + "num_tokens": 210302929.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "grad_norm": 1.3228543996810913, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8865480422973633, + "num_tokens": 210344903.0, + "step": 5516 + }, + { + "epoch": 0.7018191069838443, + "grad_norm": 1.4208546876907349, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8706886172294617, + "num_tokens": 210387670.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "grad_norm": 1.4354008436203003, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8826966285705566, + "num_tokens": 210428809.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "grad_norm": 1.6343284845352173, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.845966100692749, + "num_tokens": 210464467.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "grad_norm": 1.4598402976989746, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.871397852897644, + "num_tokens": 210502971.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "grad_norm": 1.4546911716461182, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8673489689826965, + "num_tokens": 210542593.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + 
"grad_norm": 1.4435498714447021, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8691415190696716, + "num_tokens": 210582123.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "grad_norm": 1.4393975734710693, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8822547197341919, + "num_tokens": 210617702.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "grad_norm": 1.6160365343093872, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8643007278442383, + "num_tokens": 210652723.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "grad_norm": 1.4539035558700562, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8825518488883972, + "num_tokens": 210689891.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "grad_norm": 1.4365123510360718, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8661895990371704, + "num_tokens": 210735908.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "grad_norm": 1.4502471685409546, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8845798373222351, + "num_tokens": 210777278.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "grad_norm": 1.4416539669036865, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8775632381439209, + "num_tokens": 210815867.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "grad_norm": 1.5564326047897339, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8628062605857849, + "num_tokens": 210856526.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "grad_norm": 1.5168256759643555, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8614389896392822, + "num_tokens": 210897077.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "grad_norm": 1.5863614082336426, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8520591259002686, + 
"num_tokens": 210932076.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "grad_norm": 1.466356873512268, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8784263730049133, + "num_tokens": 210968446.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "grad_norm": 1.4631742238998413, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8612810373306274, + "num_tokens": 211008821.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "grad_norm": 1.5707978010177612, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8640070557594299, + "num_tokens": 211044451.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "grad_norm": 1.6453303098678589, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8740295767784119, + "num_tokens": 211078360.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + "grad_norm": 1.4743506908416748, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8708646893501282, + "num_tokens": 211114297.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "grad_norm": 1.5829511880874634, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.87067711353302, + "num_tokens": 211147443.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "grad_norm": 1.4717919826507568, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8724014759063721, + "num_tokens": 211187813.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "grad_norm": 1.6252357959747314, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8547909259796143, + "num_tokens": 211226187.0, + "step": 5539 + }, + { + "epoch": 0.704744943391426, + "grad_norm": 1.4747132062911987, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8674237728118896, + "num_tokens": 211263553.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "grad_norm": 1.6734728813171387, + 
"learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8549565076828003, + "num_tokens": 211296768.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "grad_norm": 1.5770273208618164, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8709255456924438, + "num_tokens": 211332212.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "grad_norm": 1.4084804058074951, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8636847138404846, + "num_tokens": 211373490.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "grad_norm": 1.4911746978759766, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8742988705635071, + "num_tokens": 211411045.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "grad_norm": 1.523738145828247, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8787907361984253, + "num_tokens": 211444769.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "grad_norm": 1.477536678314209, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8845177292823792, + "num_tokens": 211482365.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "grad_norm": 1.5351495742797852, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8724228143692017, + "num_tokens": 211515877.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "grad_norm": 1.4634618759155273, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8605666160583496, + "num_tokens": 211557483.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "grad_norm": 1.501037836074829, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8642578125, + "num_tokens": 211594752.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "grad_norm": 1.4290854930877686, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8686012029647827, + "num_tokens": 211636654.0, + "step": 5550 + }, + 
{ + "epoch": 0.7061442564559216, + "grad_norm": 1.4924975633621216, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8634170293807983, + "num_tokens": 211676344.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "grad_norm": 1.5710610151290894, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8522208333015442, + "num_tokens": 211712700.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "grad_norm": 1.5657402276992798, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8686650395393372, + "num_tokens": 211747296.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "grad_norm": 1.3949676752090454, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8667218685150146, + "num_tokens": 211787010.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "grad_norm": 1.686785340309143, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8587989807128906, + "num_tokens": 211821604.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "grad_norm": 1.5765050649642944, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8736900091171265, + "num_tokens": 211855311.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "grad_norm": 1.653766393661499, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8710160851478577, + "num_tokens": 211890530.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "grad_norm": 1.5135177373886108, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8611428141593933, + "num_tokens": 211928580.0, + "step": 5558 + }, + { + "epoch": 0.7071619386846457, + "grad_norm": 1.4389439821243286, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8681217432022095, + "num_tokens": 211966172.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "grad_norm": 1.486014723777771, + "learning_rate": 1e-06, + "loss": 0.4037, + 
"mean_token_accuracy": 0.8624864220619202, + "num_tokens": 212004915.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "grad_norm": 1.4848976135253906, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8667932748794556, + "num_tokens": 212044576.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "grad_norm": 1.481415033340454, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8744916319847107, + "num_tokens": 212081409.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "grad_norm": 1.3642257452011108, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8810597658157349, + "num_tokens": 212119690.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "grad_norm": 1.5636451244354248, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8682270050048828, + "num_tokens": 212155289.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "grad_norm": 1.5319215059280396, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8641338348388672, + "num_tokens": 212189885.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "grad_norm": 1.5333462953567505, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8627605438232422, + "num_tokens": 212225796.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "grad_norm": 1.5524756908416748, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8537697792053223, + "num_tokens": 212262840.0, + "step": 5567 + }, + { + "epoch": 0.7083068311919604, + "grad_norm": 1.5660136938095093, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8671908378601074, + "num_tokens": 212296082.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "grad_norm": 1.5881577730178833, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8482229709625244, + "num_tokens": 212331938.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + 
"grad_norm": 1.5510873794555664, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8662359118461609, + "num_tokens": 212369237.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "grad_norm": 1.4325189590454102, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.876822829246521, + "num_tokens": 212404228.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "grad_norm": 1.4667863845825195, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8759627342224121, + "num_tokens": 212442182.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "grad_norm": 1.6446114778518677, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8665152192115784, + "num_tokens": 212473500.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "grad_norm": 1.6602479219436646, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8447808623313904, + "num_tokens": 212509393.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "grad_norm": 1.4461660385131836, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8652166724205017, + "num_tokens": 212547205.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "grad_norm": 1.5503700971603394, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.861809253692627, + "num_tokens": 212583929.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "grad_norm": 1.5269157886505127, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8624285459518433, + "num_tokens": 212620769.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, + "grad_norm": 1.4848260879516602, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.84953373670578, + "num_tokens": 212661591.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "grad_norm": 1.5168757438659668, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8533620834350586, + 
"num_tokens": 212700272.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "grad_norm": 1.4476637840270996, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8644481301307678, + "num_tokens": 212744895.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "grad_norm": 1.475781798362732, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8765129446983337, + "num_tokens": 212781639.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "grad_norm": 1.6932097673416138, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8495533466339111, + "num_tokens": 212816850.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "grad_norm": 1.5080665349960327, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.868576169013977, + "num_tokens": 212852715.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "grad_norm": 1.3578341007232666, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8706332445144653, + "num_tokens": 212894231.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "grad_norm": 1.4908373355865479, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8740471601486206, + "num_tokens": 212929678.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "grad_norm": 1.5098458528518677, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8693932294845581, + "num_tokens": 212969165.0, + "step": 5586 + }, + { + "epoch": 0.71072382648518, + "grad_norm": 1.4522864818572998, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8700575828552246, + "num_tokens": 213009523.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "grad_norm": 1.4509217739105225, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8710153102874756, + "num_tokens": 213050138.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "grad_norm": 1.394803524017334, + "learning_rate": 
1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.879988431930542, + "num_tokens": 213090200.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "grad_norm": 1.530667781829834, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8576109409332275, + "num_tokens": 213127845.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "grad_norm": 1.5146851539611816, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.838106095790863, + "num_tokens": 213168420.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "grad_norm": 1.442044734954834, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8714644908905029, + "num_tokens": 213207803.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "grad_norm": 1.3908227682113647, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8652881979942322, + "num_tokens": 213250020.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "grad_norm": 1.5175261497497559, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8615010976791382, + "num_tokens": 213288090.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "grad_norm": 1.5645325183868408, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8697134256362915, + "num_tokens": 213325548.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "grad_norm": 1.7349612712860107, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8578360080718994, + "num_tokens": 213355479.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "grad_norm": 1.4963802099227905, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8525815010070801, + "num_tokens": 213397224.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "grad_norm": 1.6137136220932007, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8637515306472778, + "num_tokens": 213434460.0, + "step": 5598 + }, + { + "epoch": 
0.7122503498282661, + "grad_norm": 1.6346663236618042, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8632771968841553, + "num_tokens": 213471750.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "grad_norm": 1.4652411937713623, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8706003427505493, + "num_tokens": 213508870.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "grad_norm": 1.460739016532898, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8603118658065796, + "num_tokens": 213552345.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "grad_norm": 1.8033515214920044, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8572046756744385, + "num_tokens": 213584939.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "grad_norm": 1.4238150119781494, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8584017753601074, + "num_tokens": 213627538.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "grad_norm": 1.5758148431777954, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8604844808578491, + "num_tokens": 213669977.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "grad_norm": 1.5688680410385132, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.870134711265564, + "num_tokens": 213708807.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "grad_norm": 1.5249525308609009, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8576124310493469, + "num_tokens": 213747475.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "grad_norm": 1.6049522161483765, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8631159067153931, + "num_tokens": 213785118.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "grad_norm": 1.430164098739624, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 
0.8651105165481567, + "num_tokens": 213827466.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "grad_norm": 1.4569615125656128, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8626381158828735, + "num_tokens": 213868975.0, + "step": 5609 + }, + { + "epoch": 0.7136496628927618, + "grad_norm": 1.3203085660934448, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8704725503921509, + "num_tokens": 213912756.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "grad_norm": 1.4884183406829834, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.856308102607727, + "num_tokens": 213951862.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "grad_norm": 1.3728915452957153, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8726891279220581, + "num_tokens": 213992674.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "grad_norm": 1.5615659952163696, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8681346774101257, + "num_tokens": 214027797.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "grad_norm": 1.5068180561065674, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8547499775886536, + "num_tokens": 214065305.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.590944766998291, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8531492948532104, + "num_tokens": 214099841.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "grad_norm": 1.468308687210083, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8609387874603271, + "num_tokens": 214141962.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "grad_norm": 1.5661498308181763, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.868621289730072, + "num_tokens": 214176471.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "grad_norm": 
1.590666651725769, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8533250093460083, + "num_tokens": 214216349.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "grad_norm": 1.5721030235290527, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8645882606506348, + "num_tokens": 214254597.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "grad_norm": 1.5304510593414307, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8744649291038513, + "num_tokens": 214290250.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "grad_norm": 1.5908228158950806, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8593779802322388, + "num_tokens": 214323683.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "grad_norm": 1.4563004970550537, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8656663298606873, + "num_tokens": 214363393.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "grad_norm": 1.4075090885162354, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8596346378326416, + "num_tokens": 214406433.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "grad_norm": 1.6542104482650757, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8603196144104004, + "num_tokens": 214437110.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "grad_norm": 1.437154769897461, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8724169731140137, + "num_tokens": 214474665.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "grad_norm": 1.5026090145111084, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8498325347900391, + "num_tokens": 214517593.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "grad_norm": 1.4577540159225464, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.874039888381958, + "num_tokens": 
214560342.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "grad_norm": 1.469448208808899, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8661761283874512, + "num_tokens": 214599846.0, + "step": 5628 + }, + { + "epoch": 0.7160666581859815, + "grad_norm": 1.4861056804656982, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8540854454040527, + "num_tokens": 214640098.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "grad_norm": 1.4476103782653809, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8638449907302856, + "num_tokens": 214679542.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "grad_norm": 1.8992244005203247, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8606456518173218, + "num_tokens": 214718974.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "grad_norm": 1.4870970249176025, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8675174117088318, + "num_tokens": 214757147.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "grad_norm": 1.4482492208480835, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8684970140457153, + "num_tokens": 214796978.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "grad_norm": 1.4857901334762573, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8735008239746094, + "num_tokens": 214833489.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "grad_norm": 1.3898581266403198, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8586366176605225, + "num_tokens": 214880001.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "grad_norm": 1.4324960708618164, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8749500513076782, + "num_tokens": 214920439.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "grad_norm": 1.4902571439743042, + "learning_rate": 1e-06, + 
"loss": 0.3697, + "mean_token_accuracy": 0.868706464767456, + "num_tokens": 214957722.0, + "step": 5637 + }, + { + "epoch": 0.717211550693296, + "grad_norm": 1.6001100540161133, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8540823459625244, + "num_tokens": 214994070.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "grad_norm": 1.462393045425415, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8768572807312012, + "num_tokens": 215029023.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "grad_norm": 1.4733046293258667, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8593413829803467, + "num_tokens": 215070210.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "grad_norm": 1.5669937133789062, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8662811517715454, + "num_tokens": 215105961.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "grad_norm": 1.3768552541732788, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8575940132141113, + "num_tokens": 215149157.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "grad_norm": 1.5033434629440308, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8702037334442139, + "num_tokens": 215185993.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "grad_norm": 1.4550716876983643, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8639689087867737, + "num_tokens": 215226261.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "grad_norm": 1.605760097503662, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.854114294052124, + "num_tokens": 215260528.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "grad_norm": 1.4595755338668823, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8682472705841064, + "num_tokens": 215298107.0, + "step": 5646 + }, + { + "epoch": 
0.7183564432006107, + "grad_norm": 1.4799314737319946, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8587676286697388, + "num_tokens": 215339026.0, + "step": 5647 + }, + { + "epoch": 0.7184836534792011, + "grad_norm": 1.678873062133789, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8413848876953125, + "num_tokens": 215375044.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "grad_norm": 1.4596168994903564, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8708533048629761, + "num_tokens": 215413039.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "grad_norm": 1.5280907154083252, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8658279180526733, + "num_tokens": 215448927.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "grad_norm": 1.651615858078003, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8770540952682495, + "num_tokens": 215479413.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "grad_norm": 1.7169089317321777, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8742167949676514, + "num_tokens": 215509739.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "grad_norm": 1.6250708103179932, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8662257194519043, + "num_tokens": 215543438.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "grad_norm": 1.497779130935669, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8615473508834839, + "num_tokens": 215579608.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "grad_norm": 1.6182397603988647, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8848837018013, + "num_tokens": 215612608.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "grad_norm": 1.5170178413391113, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 
0.8650901317596436, + "num_tokens": 215652497.0, + "step": 5656 + }, + { + "epoch": 0.7196285459865157, + "grad_norm": 1.568336009979248, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8657281398773193, + "num_tokens": 215686821.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "grad_norm": 1.4172425270080566, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8679964542388916, + "num_tokens": 215728615.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "grad_norm": 1.4303839206695557, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8683573603630066, + "num_tokens": 215767911.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "grad_norm": 1.3466001749038696, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8848786354064941, + "num_tokens": 215810717.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "grad_norm": 1.5314109325408936, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8620105385780334, + "num_tokens": 215851515.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "grad_norm": 1.6174299716949463, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8587219715118408, + "num_tokens": 215888425.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "grad_norm": 1.523382544517517, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8577262163162231, + "num_tokens": 215925442.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "grad_norm": 1.5405508279800415, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8722213506698608, + "num_tokens": 215958139.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "grad_norm": 1.4822826385498047, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8681800961494446, + "num_tokens": 215996153.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "grad_norm": 
1.5930861234664917, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.843436598777771, + "num_tokens": 216035882.0, + "step": 5666 + }, + { + "epoch": 0.7209006487724208, + "grad_norm": 1.3934831619262695, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8733288645744324, + "num_tokens": 216075554.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "grad_norm": 1.4602280855178833, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8676348328590393, + "num_tokens": 216111300.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "grad_norm": 1.4730252027511597, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8727021217346191, + "num_tokens": 216150374.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "grad_norm": 1.6108182668685913, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8594878315925598, + "num_tokens": 216184560.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "grad_norm": 1.4554165601730347, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8876042366027832, + "num_tokens": 216220741.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "grad_norm": 1.542612075805664, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8670822978019714, + "num_tokens": 216257685.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "grad_norm": 1.4526134729385376, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8679185509681702, + "num_tokens": 216294331.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "grad_norm": 1.4603129625320435, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8661841750144958, + "num_tokens": 216334085.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "grad_norm": 1.448111653327942, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8687376976013184, + "num_tokens": 
216372356.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "grad_norm": 1.4893722534179688, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8604015111923218, + "num_tokens": 216412685.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "grad_norm": 1.5281025171279907, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8785241842269897, + "num_tokens": 216448171.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "grad_norm": 1.366683006286621, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8908200263977051, + "num_tokens": 216488299.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "grad_norm": 1.5283880233764648, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.852229118347168, + "num_tokens": 216527894.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "grad_norm": 1.4739229679107666, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.874565601348877, + "num_tokens": 216563903.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "grad_norm": 1.4795432090759277, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8633581399917603, + "num_tokens": 216601263.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "grad_norm": 1.5587334632873535, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8522539734840393, + "num_tokens": 216640363.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "grad_norm": 1.4848556518554688, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8717949390411377, + "num_tokens": 216677543.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "grad_norm": 1.4837164878845215, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8495484590530396, + "num_tokens": 216718893.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "grad_norm": 1.5470679998397827, + "learning_rate": 1e-06, + 
"loss": 0.3904, + "mean_token_accuracy": 0.8649004101753235, + "num_tokens": 216755290.0, + "step": 5685 + }, + { + "epoch": 0.7233176440656405, + "grad_norm": 1.6107158660888672, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8623763918876648, + "num_tokens": 216792401.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "grad_norm": 1.5823581218719482, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8607228994369507, + "num_tokens": 216825198.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "grad_norm": 1.4386900663375854, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8613749742507935, + "num_tokens": 216864312.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "grad_norm": 1.385571837425232, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8717947602272034, + "num_tokens": 216907152.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "grad_norm": 1.5339720249176025, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8801373839378357, + "num_tokens": 216942406.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "grad_norm": 1.5110164880752563, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8667377233505249, + "num_tokens": 216981306.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "grad_norm": 1.4672603607177734, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8547903299331665, + "num_tokens": 217021382.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "grad_norm": 1.5284602642059326, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8635730147361755, + "num_tokens": 217058342.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "grad_norm": 1.3987514972686768, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8609446883201599, + "num_tokens": 217102741.0, + "step": 5694 + }, + { + "epoch": 
0.7244625365729551, + "grad_norm": 1.582743763923645, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8623150587081909, + "num_tokens": 217138178.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "grad_norm": 1.4472787380218506, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8703716397285461, + "num_tokens": 217181428.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "grad_norm": 1.4215582609176636, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8696470260620117, + "num_tokens": 217221278.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "grad_norm": 1.3921669721603394, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8716295957565308, + "num_tokens": 217261954.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "grad_norm": 1.496588945388794, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8594189882278442, + "num_tokens": 217301555.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "grad_norm": 1.6106312274932861, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8498834371566772, + "num_tokens": 217339446.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "grad_norm": 1.6427578926086426, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8500895500183105, + "num_tokens": 217374137.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "grad_norm": 1.475382685661316, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8520885705947876, + "num_tokens": 217414576.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "grad_norm": 1.479803204536438, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8702971339225769, + "num_tokens": 217454724.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "grad_norm": 1.534443974494934, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 
0.8497529625892639, + "num_tokens": 217490567.0, + "step": 5704 + }, + { + "epoch": 0.7257346393588602, + "grad_norm": 1.3818116188049316, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8606444597244263, + "num_tokens": 217535568.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "grad_norm": 1.436193823814392, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8605628609657288, + "num_tokens": 217575425.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "grad_norm": 1.5439561605453491, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.87279212474823, + "num_tokens": 217608469.0, + "step": 5707 + }, + { + "epoch": 0.7261162701946318, + "grad_norm": 1.4646865129470825, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8574013710021973, + "num_tokens": 217652775.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "grad_norm": 1.4516661167144775, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8616198897361755, + "num_tokens": 217693482.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "grad_norm": 1.6031068563461304, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8770278692245483, + "num_tokens": 217725720.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "grad_norm": 1.5921944379806519, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8655206561088562, + "num_tokens": 217761261.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "grad_norm": 1.452062964439392, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8560834527015686, + "num_tokens": 217802781.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "grad_norm": 1.4298471212387085, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.870512068271637, + "num_tokens": 217843681.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "grad_norm": 
1.4856371879577637, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8652305603027344, + "num_tokens": 217884141.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "grad_norm": 1.615824580192566, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8632084131240845, + "num_tokens": 217917916.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "grad_norm": 1.7522399425506592, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8575604557991028, + "num_tokens": 217948539.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "grad_norm": 1.5502073764801025, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8572063446044922, + "num_tokens": 217988314.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "grad_norm": 1.5934466123580933, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8554297685623169, + "num_tokens": 218027202.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "grad_norm": 1.4373162984848022, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.877476692199707, + "num_tokens": 218064808.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "grad_norm": 1.4438520669937134, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8541908264160156, + "num_tokens": 218107525.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "grad_norm": 1.477052927017212, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8681290745735168, + "num_tokens": 218146574.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "grad_norm": 1.5084481239318848, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.861177921295166, + "num_tokens": 218183970.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "grad_norm": 1.5344655513763428, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8726084232330322, + "num_tokens": 
218216542.0, + "step": 5723 + }, + { + "epoch": 0.7281516346520799, + "grad_norm": 1.420953392982483, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8787931203842163, + "num_tokens": 218260573.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "grad_norm": 1.526814341545105, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8713726997375488, + "num_tokens": 218299887.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "grad_norm": 1.5664629936218262, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8488240242004395, + "num_tokens": 218335617.0, + "step": 5726 + }, + { + "epoch": 0.7285332654878515, + "grad_norm": 1.5377171039581299, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8564252853393555, + "num_tokens": 218371762.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "grad_norm": 1.518517255783081, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8567575812339783, + "num_tokens": 218412841.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "grad_norm": 1.694246530532837, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8759410977363586, + "num_tokens": 218450234.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "grad_norm": 1.8257226943969727, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8614431619644165, + "num_tokens": 218482198.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "grad_norm": 1.525172472000122, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8632781505584717, + "num_tokens": 218522153.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "grad_norm": 1.5426814556121826, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8466265201568604, + "num_tokens": 218561128.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "grad_norm": 1.5037800073623657, + "learning_rate": 1e-06, + 
"loss": 0.4064, + "mean_token_accuracy": 0.8600815534591675, + "num_tokens": 218600169.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "grad_norm": 1.4506727457046509, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8813753128051758, + "num_tokens": 218640737.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "grad_norm": 1.7746946811676025, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8589792251586914, + "num_tokens": 218676999.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "grad_norm": 1.4084728956222534, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8656338453292847, + "num_tokens": 218721324.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "grad_norm": 1.3624145984649658, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8748009204864502, + "num_tokens": 218763148.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "grad_norm": 1.4979387521743774, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8725911378860474, + "num_tokens": 218798222.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "grad_norm": 1.6290981769561768, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8711448311805725, + "num_tokens": 218831597.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "grad_norm": 1.4699426889419556, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8719931840896606, + "num_tokens": 218869872.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "grad_norm": 1.5439989566802979, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8651193976402283, + "num_tokens": 218906717.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "grad_norm": 1.4384733438491821, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8583264350891113, + "num_tokens": 218952199.0, + "step": 5742 + }, + { + "epoch": 
0.7305686299452996, + "grad_norm": 1.3453956842422485, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8591222763061523, + "num_tokens": 218999650.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "grad_norm": 1.570820927619934, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8526596426963806, + "num_tokens": 219039398.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "grad_norm": 1.4886143207550049, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8676460981369019, + "num_tokens": 219080152.0, + "step": 5745 + }, + { + "epoch": 0.7309502607810711, + "grad_norm": 1.3125249147415161, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8680744171142578, + "num_tokens": 219127290.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "grad_norm": 1.6854252815246582, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8632560968399048, + "num_tokens": 219159056.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "grad_norm": 1.4765493869781494, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8512718677520752, + "num_tokens": 219202280.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "grad_norm": 1.496472954750061, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8686779141426086, + "num_tokens": 219240230.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "grad_norm": 1.4573036432266235, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8746984601020813, + "num_tokens": 219279145.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "grad_norm": 1.6006100177764893, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8808264136314392, + "num_tokens": 219314705.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "grad_norm": 1.3925749063491821, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 
0.8717600107192993, + "num_tokens": 219359702.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "grad_norm": 1.474644660949707, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8699901103973389, + "num_tokens": 219396309.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "grad_norm": 1.4938088655471802, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8596489429473877, + "num_tokens": 219432775.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "grad_norm": 1.4912408590316772, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8631108999252319, + "num_tokens": 219472331.0, + "step": 5755 + }, + { + "epoch": 0.7322223635669762, + "grad_norm": 1.574849247932434, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8644422292709351, + "num_tokens": 219506175.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "grad_norm": 1.4051451683044434, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8735102415084839, + "num_tokens": 219546607.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "grad_norm": 1.4429970979690552, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8656426668167114, + "num_tokens": 219587159.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "grad_norm": 1.3770244121551514, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8652170896530151, + "num_tokens": 219632597.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "grad_norm": 1.5955822467803955, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8696052432060242, + "num_tokens": 219663837.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "grad_norm": 1.4323374032974243, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.857295036315918, + "num_tokens": 219706252.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "grad_norm": 
1.4181525707244873, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8670363426208496, + "num_tokens": 219747180.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "grad_norm": 1.5606154203414917, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8614745736122131, + "num_tokens": 219786283.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "grad_norm": 1.5279513597488403, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8731170892715454, + "num_tokens": 219821097.0, + "step": 5764 + }, + { + "epoch": 0.7333672560742908, + "grad_norm": 1.5449243783950806, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8469608426094055, + "num_tokens": 219861901.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "grad_norm": 1.4346998929977417, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8626023530960083, + "num_tokens": 219903929.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "grad_norm": 1.602513074874878, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.856289267539978, + "num_tokens": 219940992.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "grad_norm": 1.6044833660125732, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8708125352859497, + "num_tokens": 219973930.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "grad_norm": 1.6140179634094238, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8642295598983765, + "num_tokens": 220008156.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "grad_norm": 1.4966105222702026, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8663407564163208, + "num_tokens": 220046238.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "grad_norm": 1.549518346786499, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8645427823066711, + "num_tokens": 
220088080.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "grad_norm": 1.4681050777435303, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8741955757141113, + "num_tokens": 220127697.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "grad_norm": 1.6986355781555176, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8681716918945312, + "num_tokens": 220161385.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "grad_norm": 1.5410181283950806, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8616151213645935, + "num_tokens": 220200288.0, + "step": 5774 + }, + { + "epoch": 0.7346393588601959, + "grad_norm": 1.524753212928772, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8511600494384766, + "num_tokens": 220244327.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "grad_norm": 1.5104128122329712, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8842073678970337, + "num_tokens": 220277477.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "grad_norm": 1.5796537399291992, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8628665804862976, + "num_tokens": 220312377.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "grad_norm": 1.581102728843689, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8685262203216553, + "num_tokens": 220345451.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "grad_norm": 1.4806199073791504, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8704969882965088, + "num_tokens": 220389378.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "grad_norm": 1.5862716436386108, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8535944223403931, + "num_tokens": 220429814.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "grad_norm": 1.6591726541519165, + "learning_rate": 1e-06, + 
"loss": 0.3963, + "mean_token_accuracy": 0.8627450466156006, + "num_tokens": 220461881.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "grad_norm": 1.581507682800293, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8519970178604126, + "num_tokens": 220498948.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "grad_norm": 1.5207221508026123, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.848791778087616, + "num_tokens": 220536765.0, + "step": 5783 + }, + { + "epoch": 0.7357842513675105, + "grad_norm": 1.4746379852294922, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8566762208938599, + "num_tokens": 220580916.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "grad_norm": 1.5597838163375854, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8625288605690002, + "num_tokens": 220615148.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "grad_norm": 1.5648165941238403, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.861276388168335, + "num_tokens": 220656471.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "grad_norm": 1.3927541971206665, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8564577102661133, + "num_tokens": 220698930.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "grad_norm": 1.5852330923080444, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8665772080421448, + "num_tokens": 220733780.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "grad_norm": 1.4149527549743652, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8723050951957703, + "num_tokens": 220774631.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "grad_norm": 1.3948935270309448, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8580533862113953, + "num_tokens": 220817969.0, + "step": 5790 + }, + { + "epoch": 
0.736674723317644, + "grad_norm": 1.5270867347717285, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.851719856262207, + "num_tokens": 220856750.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "grad_norm": 1.5692189931869507, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8578321933746338, + "num_tokens": 220892039.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "grad_norm": 1.4618884325027466, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8778727054595947, + "num_tokens": 220928940.0, + "step": 5793 + }, + { + "epoch": 0.7370563541534156, + "grad_norm": 1.4873452186584473, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8455625772476196, + "num_tokens": 220970320.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "grad_norm": 1.5959683656692505, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8569212555885315, + "num_tokens": 221007207.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "grad_norm": 1.5153934955596924, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8729743957519531, + "num_tokens": 221044105.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "grad_norm": 1.4378762245178223, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8638042211532593, + "num_tokens": 221084226.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "grad_norm": 1.388220191001892, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8579917550086975, + "num_tokens": 221125501.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "grad_norm": 1.3995710611343384, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8659698367118835, + "num_tokens": 221167208.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "grad_norm": 1.634547233581543, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 
0.852850079536438, + "num_tokens": 221203005.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "grad_norm": 1.3775006532669067, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.87638920545578, + "num_tokens": 221242624.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "grad_norm": 1.453559398651123, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8626514673233032, + "num_tokens": 221281889.0, + "step": 5802 + }, + { + "epoch": 0.7382012466607302, + "grad_norm": 1.5143219232559204, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8616530895233154, + "num_tokens": 221321734.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "grad_norm": 1.4701634645462036, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8774040937423706, + "num_tokens": 221363542.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "grad_norm": 1.4487202167510986, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8700301647186279, + "num_tokens": 221403654.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "grad_norm": 1.5066348314285278, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8827289342880249, + "num_tokens": 221441140.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "grad_norm": 1.34951913356781, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8560817837715149, + "num_tokens": 221485826.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "grad_norm": 1.5103423595428467, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8639441132545471, + "num_tokens": 221524556.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "grad_norm": 1.45388925075531, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8745607733726501, + "num_tokens": 221560568.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "grad_norm": 
1.6157183647155762, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.854525625705719, + "num_tokens": 221599998.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "grad_norm": 1.7072556018829346, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8520601987838745, + "num_tokens": 221636430.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "grad_norm": 1.7981771230697632, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8590940833091736, + "num_tokens": 221665263.0, + "step": 5812 + }, + { + "epoch": 0.7394733494466353, + "grad_norm": 1.526287317276001, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.868073582649231, + "num_tokens": 221703266.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "grad_norm": 1.441856861114502, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.876735270023346, + "num_tokens": 221741620.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "grad_norm": 1.59408700466156, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8470357656478882, + "num_tokens": 221780329.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "grad_norm": 1.499712586402893, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8603112697601318, + "num_tokens": 221821456.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "grad_norm": 1.5132129192352295, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8542976379394531, + "num_tokens": 221860562.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "grad_norm": 1.4572184085845947, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8660260438919067, + "num_tokens": 221896631.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "grad_norm": 1.4242948293685913, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8631223440170288, + "num_tokens": 
221939323.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "grad_norm": 1.4480564594268799, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8780041337013245, + "num_tokens": 221975991.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "grad_norm": 1.5479438304901123, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8836110234260559, + "num_tokens": 222008665.0, + "step": 5821 + }, + { + "epoch": 0.7406182419539499, + "grad_norm": 1.5177143812179565, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8648543953895569, + "num_tokens": 222045872.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "grad_norm": 1.3112133741378784, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8714512586593628, + "num_tokens": 222084764.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "grad_norm": 1.4899914264678955, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8750302195549011, + "num_tokens": 222121331.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "grad_norm": 1.3661304712295532, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8526749014854431, + "num_tokens": 222167198.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "grad_norm": 1.5287009477615356, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8613135814666748, + "num_tokens": 222201761.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "grad_norm": 1.4271321296691895, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8628026247024536, + "num_tokens": 222243579.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "grad_norm": 1.561779260635376, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8603804111480713, + "num_tokens": 222283868.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "grad_norm": 1.4741555452346802, + "learning_rate": 1e-06, 
+ "loss": 0.3213, + "mean_token_accuracy": 0.8889889717102051, + "num_tokens": 222318300.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "grad_norm": 1.4366686344146729, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8688305616378784, + "num_tokens": 222355656.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "grad_norm": 1.5894075632095337, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8607085943222046, + "num_tokens": 222389514.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "grad_norm": 1.4609928131103516, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.858726441860199, + "num_tokens": 222430738.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "grad_norm": 1.4548583030700684, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8753542900085449, + "num_tokens": 222472224.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "grad_norm": 1.5890642404556274, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8585230112075806, + "num_tokens": 222509674.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "grad_norm": 1.5390568971633911, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8741923570632935, + "num_tokens": 222542521.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "grad_norm": 1.5086383819580078, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8661547899246216, + "num_tokens": 222578186.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "grad_norm": 1.3272182941436768, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8782328367233276, + "num_tokens": 222623236.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "grad_norm": 1.6179293394088745, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8505932688713074, + "num_tokens": 222659295.0, + "step": 5838 + }, + { + "epoch": 
0.7427808166899885, + "grad_norm": 1.556696891784668, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8690658807754517, + "num_tokens": 222695786.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "grad_norm": 1.6302289962768555, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8688204288482666, + "num_tokens": 222731897.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "grad_norm": 1.5588278770446777, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8531988859176636, + "num_tokens": 222774292.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "grad_norm": 1.378412127494812, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.876896321773529, + "num_tokens": 222815648.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "grad_norm": 1.465505838394165, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8831411004066467, + "num_tokens": 222851118.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "grad_norm": 1.5593295097351074, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8620150089263916, + "num_tokens": 222887902.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "grad_norm": 1.5106685161590576, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8741346597671509, + "num_tokens": 222924181.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "grad_norm": 1.5873826742172241, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8530257940292358, + "num_tokens": 222958960.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "grad_norm": 1.7147079706192017, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.873842179775238, + "num_tokens": 222990864.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "grad_norm": 1.4450035095214844, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 
0.8795953989028931, + "num_tokens": 223026875.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "grad_norm": 1.5281965732574463, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8522174954414368, + "num_tokens": 223067178.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "grad_norm": 1.4263708591461182, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8688768148422241, + "num_tokens": 223110588.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "grad_norm": 1.5367109775543213, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8685188293457031, + "num_tokens": 223145063.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "grad_norm": 1.5283626317977905, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8581615090370178, + "num_tokens": 223182624.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "grad_norm": 1.5201257467269897, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8800175189971924, + "num_tokens": 223219893.0, + "step": 5853 + }, + { + "epoch": 0.7446889708688462, + "grad_norm": 1.5964571237564087, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8680452108383179, + "num_tokens": 223254541.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "grad_norm": 1.411986231803894, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8793814182281494, + "num_tokens": 223296594.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "grad_norm": 1.708814263343811, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8692640662193298, + "num_tokens": 223326773.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "grad_norm": 1.4700329303741455, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8642839193344116, + "num_tokens": 223368112.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "grad_norm": 
1.3893556594848633, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8778190612792969, + "num_tokens": 223407039.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "grad_norm": 1.4617359638214111, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8561050891876221, + "num_tokens": 223448116.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "grad_norm": 1.4622807502746582, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8696744441986084, + "num_tokens": 223489952.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "grad_norm": 1.370760202407837, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.88875812292099, + "num_tokens": 223530788.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "grad_norm": 1.4304351806640625, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8740131258964539, + "num_tokens": 223571698.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "grad_norm": 1.705182671546936, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8599346876144409, + "num_tokens": 223603785.0, + "step": 5863 + }, + { + "epoch": 0.7459610736547513, + "grad_norm": 1.3964250087738037, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8611904978752136, + "num_tokens": 223643819.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "grad_norm": 1.3031646013259888, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8798996806144714, + "num_tokens": 223690412.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "grad_norm": 1.5582084655761719, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8664724230766296, + "num_tokens": 223728101.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "grad_norm": 1.5834053754806519, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.863146185874939, + "num_tokens": 
223764863.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "grad_norm": 1.5016552209854126, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.879326343536377, + "num_tokens": 223802162.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "grad_norm": 1.4493767023086548, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8709853887557983, + "num_tokens": 223839519.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "grad_norm": 1.5386723279953003, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8670918345451355, + "num_tokens": 223877278.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "grad_norm": 1.465459942817688, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8613487482070923, + "num_tokens": 223918136.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "grad_norm": 1.4339262247085571, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8767315149307251, + "num_tokens": 223959603.0, + "step": 5872 + }, + { + "epoch": 0.7471059661620659, + "grad_norm": 1.5046546459197998, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8772933483123779, + "num_tokens": 223997337.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "grad_norm": 1.5998297929763794, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8736799359321594, + "num_tokens": 224030987.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "grad_norm": 1.4517124891281128, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8643091917037964, + "num_tokens": 224069863.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "grad_norm": 1.4790745973587036, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8625577688217163, + "num_tokens": 224110943.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "grad_norm": 1.49668550491333, + "learning_rate": 1e-06, + 
"loss": 0.3753, + "mean_token_accuracy": 0.8704662919044495, + "num_tokens": 224146635.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "grad_norm": 1.6773219108581543, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8690885305404663, + "num_tokens": 224178730.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "grad_norm": 1.4253968000411987, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8551923036575317, + "num_tokens": 224220715.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "grad_norm": 1.423258900642395, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8615336418151855, + "num_tokens": 224262997.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "grad_norm": 1.5060441493988037, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8660480976104736, + "num_tokens": 224300954.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "grad_norm": 1.6809253692626953, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8761142492294312, + "num_tokens": 224330507.0, + "step": 5882 + }, + { + "epoch": 0.748378068947971, + "grad_norm": 1.443049430847168, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8736146688461304, + "num_tokens": 224371343.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "grad_norm": 1.5031089782714844, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8774951696395874, + "num_tokens": 224404804.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "grad_norm": 1.441861629486084, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8792740106582642, + "num_tokens": 224438596.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "grad_norm": 1.6191726922988892, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8508533239364624, + "num_tokens": 224473136.0, + "step": 5886 + }, + { + "epoch": 
0.748886910062333, + "grad_norm": 1.4495205879211426, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8716991543769836, + "num_tokens": 224512097.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "grad_norm": 1.439446210861206, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8618195652961731, + "num_tokens": 224551698.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "grad_norm": 1.6455141305923462, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8706059455871582, + "num_tokens": 224585038.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "grad_norm": 1.419728398323059, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8562946915626526, + "num_tokens": 224625185.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "grad_norm": 1.6084175109863281, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8706569671630859, + "num_tokens": 224655457.0, + "step": 5891 + }, + { + "epoch": 0.7495229614552856, + "grad_norm": 1.4578595161437988, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8664936423301697, + "num_tokens": 224695184.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "grad_norm": 1.6509982347488403, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8687303066253662, + "num_tokens": 224726436.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "grad_norm": 1.446166753768921, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8756750226020813, + "num_tokens": 224761354.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "grad_norm": 1.6485426425933838, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8607932925224304, + "num_tokens": 224796080.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "grad_norm": 1.479177713394165, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 
0.8674586415290833, + "num_tokens": 224837956.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "grad_norm": 1.7064988613128662, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8525941967964172, + "num_tokens": 224871087.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "grad_norm": 1.5469112396240234, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8562953472137451, + "num_tokens": 224911418.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "grad_norm": 1.5455400943756104, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8693509101867676, + "num_tokens": 224950424.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "grad_norm": 1.5545332431793213, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8646035194396973, + "num_tokens": 224983938.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "grad_norm": 1.5625407695770264, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.86824631690979, + "num_tokens": 225019059.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "grad_norm": 1.5425522327423096, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8512508869171143, + "num_tokens": 225057607.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "grad_norm": 1.4850040674209595, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.86627596616745, + "num_tokens": 225096638.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "grad_norm": 1.581597089767456, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8651381134986877, + "num_tokens": 225130463.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "grad_norm": 1.5726356506347656, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8697015643119812, + "num_tokens": 225162892.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "grad_norm": 
1.6191036701202393, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8543680906295776, + "num_tokens": 225197170.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "grad_norm": 1.5190632343292236, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8685507774353027, + "num_tokens": 225233873.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "grad_norm": 1.4856257438659668, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8720265030860901, + "num_tokens": 225272888.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "grad_norm": 1.4790420532226562, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8629792928695679, + "num_tokens": 225312087.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "grad_norm": 1.3880630731582642, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8735426068305969, + "num_tokens": 225354332.0, + "step": 5910 + }, + { + "epoch": 0.7519399567485053, + "grad_norm": 1.4428960084915161, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.88107830286026, + "num_tokens": 225388613.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "grad_norm": 1.6487513780593872, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8453766107559204, + "num_tokens": 225423858.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "grad_norm": 1.4639689922332764, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8764379620552063, + "num_tokens": 225462279.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "grad_norm": 1.584935188293457, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8726781606674194, + "num_tokens": 225497423.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "grad_norm": 1.5052528381347656, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8604575991630554, + "num_tokens": 
225537697.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "grad_norm": 1.5877232551574707, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8739042282104492, + "num_tokens": 225571071.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "grad_norm": 1.463454246520996, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8653725385665894, + "num_tokens": 225610735.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "grad_norm": 1.398701548576355, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8821577429771423, + "num_tokens": 225647306.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "grad_norm": 1.5391238927841187, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8589067459106445, + "num_tokens": 225686461.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "grad_norm": 1.3847466707229614, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8717195391654968, + "num_tokens": 225727292.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "grad_norm": 1.3582075834274292, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8723596334457397, + "num_tokens": 225768761.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "grad_norm": 1.6307729482650757, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8634752035140991, + "num_tokens": 225803275.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "grad_norm": 1.478111743927002, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8643358945846558, + "num_tokens": 225840660.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "grad_norm": 1.600127935409546, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8600994348526001, + "num_tokens": 225875803.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "grad_norm": 1.5248441696166992, + "learning_rate": 1e-06, + 
"loss": 0.3725, + "mean_token_accuracy": 0.8726567029953003, + "num_tokens": 225909795.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "grad_norm": 1.6131746768951416, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.856225311756134, + "num_tokens": 225943659.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "grad_norm": 1.3257251977920532, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8762689232826233, + "num_tokens": 225985003.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "grad_norm": 1.4915797710418701, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8685166239738464, + "num_tokens": 226022169.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "grad_norm": 1.4478733539581299, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8879515528678894, + "num_tokens": 226057448.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, + "grad_norm": 1.2844558954238892, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8895021080970764, + "num_tokens": 226101784.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "grad_norm": 1.3962956666946411, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8808135986328125, + "num_tokens": 226141654.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "grad_norm": 1.5223205089569092, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8769370317459106, + "num_tokens": 226175071.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "grad_norm": 1.4143760204315186, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8704298734664917, + "num_tokens": 226215871.0, + "step": 5933 + }, + { + "epoch": 0.754865793156087, + "grad_norm": 1.5623750686645508, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8583647012710571, + "num_tokens": 226250955.0, + "step": 5934 + }, + { + "epoch": 
0.7549930034346776, + "grad_norm": 1.6017588376998901, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8588317036628723, + "num_tokens": 226282793.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "grad_norm": 1.3347957134246826, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8713720440864563, + "num_tokens": 226325926.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "grad_norm": 1.4677705764770508, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8737388849258423, + "num_tokens": 226364094.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "grad_norm": 1.3499175310134888, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.873708963394165, + "num_tokens": 226406239.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "grad_norm": 1.4582852125167847, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8682512044906616, + "num_tokens": 226444141.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "grad_norm": 1.6517374515533447, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8682543039321899, + "num_tokens": 226476668.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "grad_norm": 1.682448387145996, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8630111217498779, + "num_tokens": 226508024.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "grad_norm": 1.4723868370056152, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8475741744041443, + "num_tokens": 226552221.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "grad_norm": 1.5786988735198975, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8643152713775635, + "num_tokens": 226588838.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "grad_norm": 1.4929945468902588, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 
0.8751659393310547, + "num_tokens": 226628650.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "grad_norm": 1.6887860298156738, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8608862161636353, + "num_tokens": 226662363.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "grad_norm": 1.4899989366531372, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.852374255657196, + "num_tokens": 226702742.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "grad_norm": 1.4921990633010864, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8671573400497437, + "num_tokens": 226740041.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "grad_norm": 1.3551031351089478, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8649035692214966, + "num_tokens": 226785978.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "grad_norm": 1.5129222869873047, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8610552549362183, + "num_tokens": 226823693.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "grad_norm": 1.5563313961029053, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8799601793289185, + "num_tokens": 226858716.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "grad_norm": 1.3904657363891602, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8829800486564636, + "num_tokens": 226899398.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "grad_norm": 1.426259994506836, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8857883214950562, + "num_tokens": 226933960.0, + "step": 5952 + }, + { + "epoch": 0.7572827884493067, + "grad_norm": 1.5087428092956543, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8723264932632446, + "num_tokens": 226968529.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "grad_norm": 
1.383932113647461, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8741834163665771, + "num_tokens": 227010050.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "grad_norm": 1.3797248601913452, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8787239193916321, + "num_tokens": 227053300.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "grad_norm": 1.5407310724258423, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8724140524864197, + "num_tokens": 227088741.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "grad_norm": 1.3675981760025024, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8703899383544922, + "num_tokens": 227131534.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "grad_norm": 1.6037932634353638, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8794463276863098, + "num_tokens": 227165361.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "grad_norm": 1.4420170783996582, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8733595609664917, + "num_tokens": 227203304.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "grad_norm": 1.5643136501312256, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8644357323646545, + "num_tokens": 227240410.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "grad_norm": 1.484122633934021, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8597661256790161, + "num_tokens": 227278537.0, + "step": 5961 + }, + { + "epoch": 0.7584276809566213, + "grad_norm": 1.4674341678619385, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8501942157745361, + "num_tokens": 227321746.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "grad_norm": 1.6239830255508423, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8696341514587402, + "num_tokens": 
227356487.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "grad_norm": 1.5644510984420776, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8711808919906616, + "num_tokens": 227391682.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "grad_norm": 1.5556070804595947, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8819462060928345, + "num_tokens": 227424791.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "grad_norm": 1.6260939836502075, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8749324083328247, + "num_tokens": 227454257.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "grad_norm": 1.521231770515442, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8646611571311951, + "num_tokens": 227493310.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "grad_norm": 1.49383544921875, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8675978183746338, + "num_tokens": 227531587.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "grad_norm": 1.3830804824829102, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8710916042327881, + "num_tokens": 227572321.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "grad_norm": 1.6847341060638428, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.860554039478302, + "num_tokens": 227607013.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "grad_norm": 1.4385226964950562, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.856489896774292, + "num_tokens": 227649779.0, + "step": 5971 + }, + { + "epoch": 0.7596997837425264, + "grad_norm": 1.544290542602539, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8523203134536743, + "num_tokens": 227690191.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "grad_norm": 1.5357704162597656, + "learning_rate": 1e-06, + 
"loss": 0.3864, + "mean_token_accuracy": 0.8653078079223633, + "num_tokens": 227725491.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "grad_norm": 1.4878394603729248, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8856835961341858, + "num_tokens": 227763950.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "grad_norm": 1.5241689682006836, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8740026950836182, + "num_tokens": 227799200.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "grad_norm": 1.507131814956665, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.854605495929718, + "num_tokens": 227839100.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "grad_norm": 1.721266508102417, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8555468320846558, + "num_tokens": 227869927.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "grad_norm": 1.4904147386550903, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8727220296859741, + "num_tokens": 227907598.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "grad_norm": 1.5968250036239624, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8653601408004761, + "num_tokens": 227940142.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "grad_norm": 1.4450438022613525, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8604899644851685, + "num_tokens": 227980405.0, + "step": 5980 + }, + { + "epoch": 0.760844676249841, + "grad_norm": 1.3502315282821655, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8771190643310547, + "num_tokens": 228023196.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "grad_norm": 1.5909985303878784, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8569228649139404, + "num_tokens": 228061532.0, + "step": 5982 + }, + { + "epoch": 
0.761099096807022, + "grad_norm": 1.5409497022628784, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8707672953605652, + "num_tokens": 228100474.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "grad_norm": 1.4643769264221191, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8611568808555603, + "num_tokens": 228138456.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "grad_norm": 1.6457269191741943, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8539267182350159, + "num_tokens": 228174495.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "grad_norm": 1.5216598510742188, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8697879314422607, + "num_tokens": 228212765.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "grad_norm": 1.422191858291626, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8615137338638306, + "num_tokens": 228252039.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "grad_norm": 1.502082347869873, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8589242696762085, + "num_tokens": 228291711.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "grad_norm": 1.4789172410964966, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8644919395446777, + "num_tokens": 228330074.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "grad_norm": 1.497933030128479, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8829483985900879, + "num_tokens": 228364814.0, + "step": 5990 + }, + { + "epoch": 0.762116779035746, + "grad_norm": 1.5526076555252075, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8716484308242798, + "num_tokens": 228400012.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "grad_norm": 1.4199631214141846, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 
0.883246123790741, + "num_tokens": 228436684.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "grad_norm": 1.499255657196045, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.884300947189331, + "num_tokens": 228468905.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "grad_norm": 1.5962527990341187, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8672375082969666, + "num_tokens": 228502689.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "grad_norm": 1.4245636463165283, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8722647428512573, + "num_tokens": 228545853.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "grad_norm": 1.4886610507965088, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.877084493637085, + "num_tokens": 228582029.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "grad_norm": 1.4194684028625488, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8579622507095337, + "num_tokens": 228624082.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "grad_norm": 1.4751840829849243, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8700927495956421, + "num_tokens": 228659351.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "grad_norm": 1.4296079874038696, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8715214133262634, + "num_tokens": 228699981.0, + "step": 5999 + }, + { + "epoch": 0.7632616715430607, + "grad_norm": 1.4543983936309814, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8714470267295837, + "num_tokens": 228743354.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "grad_norm": 1.5168040990829468, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8542275428771973, + "num_tokens": 228779087.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "grad_norm": 
1.4305604696273804, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8820247054100037, + "num_tokens": 228817140.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "grad_norm": 1.5508356094360352, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8496126532554626, + "num_tokens": 228858600.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "grad_norm": 1.4862756729125977, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8804172873497009, + "num_tokens": 228893487.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "grad_norm": 1.4889637231826782, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.869529664516449, + "num_tokens": 228935592.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "grad_norm": 1.5701814889907837, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8694314360618591, + "num_tokens": 228971770.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "grad_norm": 1.3871667385101318, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8649073243141174, + "num_tokens": 229016830.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "grad_norm": 1.560825228691101, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8633542060852051, + "num_tokens": 229055275.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "grad_norm": 1.505729079246521, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.884859025478363, + "num_tokens": 229089929.0, + "step": 6009 + }, + { + "epoch": 0.7645337743289657, + "grad_norm": 1.3256336450576782, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8745968341827393, + "num_tokens": 229132994.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "grad_norm": 1.442602276802063, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.872097373008728, + "num_tokens": 
229174447.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "grad_norm": 1.5452576875686646, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8712295889854431, + "num_tokens": 229208662.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "grad_norm": 1.4611589908599854, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8797958493232727, + "num_tokens": 229245088.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "grad_norm": 1.5105313062667847, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8652071952819824, + "num_tokens": 229282847.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "grad_norm": 1.4933440685272217, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8648834824562073, + "num_tokens": 229320181.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "grad_norm": 1.4766796827316284, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8574801683425903, + "num_tokens": 229358605.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "grad_norm": 1.671605110168457, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.870829164981842, + "num_tokens": 229391098.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "grad_norm": 1.3244646787643433, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8719996213912964, + "num_tokens": 229436244.0, + "step": 6018 + }, + { + "epoch": 0.7656786668362804, + "grad_norm": 1.5858925580978394, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8625926971435547, + "num_tokens": 229470059.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "grad_norm": 1.514770269393921, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8697659373283386, + "num_tokens": 229505591.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "grad_norm": 1.5244596004486084, + "learning_rate": 1e-06, + 
"loss": 0.3586, + "mean_token_accuracy": 0.8752868175506592, + "num_tokens": 229539174.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "grad_norm": 1.5495564937591553, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8587062358856201, + "num_tokens": 229573374.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "grad_norm": 1.4060286283493042, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8605389595031738, + "num_tokens": 229615553.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "grad_norm": 1.469131350517273, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.864665150642395, + "num_tokens": 229654542.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "grad_norm": 1.464599609375, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8669966459274292, + "num_tokens": 229695292.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "grad_norm": 1.4309040307998657, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8675068616867065, + "num_tokens": 229738077.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "grad_norm": 1.426558494567871, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8757688403129578, + "num_tokens": 229779000.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "grad_norm": 1.506140947341919, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8637068271636963, + "num_tokens": 229817864.0, + "step": 6028 + }, + { + "epoch": 0.7669507696221854, + "grad_norm": 1.3980504274368286, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8706937432289124, + "num_tokens": 229855824.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "grad_norm": 1.5513540506362915, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8687095046043396, + "num_tokens": 229889872.0, + "step": 6030 + }, + { + "epoch": 
0.7672051901793665, + "grad_norm": 1.612769603729248, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8450956344604492, + "num_tokens": 229927716.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "grad_norm": 1.5387417078018188, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8718513250350952, + "num_tokens": 229960410.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "grad_norm": 1.5747298002243042, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.86578369140625, + "num_tokens": 229995271.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "grad_norm": 1.4456143379211426, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8547521829605103, + "num_tokens": 230038051.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "grad_norm": 1.4527709484100342, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8673006296157837, + "num_tokens": 230080909.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "grad_norm": 1.6037039756774902, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8478929996490479, + "num_tokens": 230117168.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "grad_norm": 1.5820335149765015, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8416024446487427, + "num_tokens": 230156291.0, + "step": 6037 + }, + { + "epoch": 0.7680956621295001, + "grad_norm": 1.4691606760025024, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8527112007141113, + "num_tokens": 230198617.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "grad_norm": 1.611634612083435, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8631345629692078, + "num_tokens": 230231312.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "grad_norm": 1.541098952293396, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 
0.8708749413490295, + "num_tokens": 230265605.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "grad_norm": 1.4463565349578857, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8589329719543457, + "num_tokens": 230304977.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "grad_norm": 1.4317559003829956, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8631293773651123, + "num_tokens": 230344870.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "grad_norm": 1.444175124168396, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8811407685279846, + "num_tokens": 230383803.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "grad_norm": 1.4226007461547852, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8710854053497314, + "num_tokens": 230423522.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "grad_norm": 1.3712748289108276, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8552121520042419, + "num_tokens": 230471437.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "grad_norm": 1.6681053638458252, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8549103736877441, + "num_tokens": 230503300.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "grad_norm": 1.5045799016952515, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.856498658657074, + "num_tokens": 230544070.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "grad_norm": 1.3295432329177856, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8714805841445923, + "num_tokens": 230586573.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "grad_norm": 1.4995334148406982, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8534889817237854, + "num_tokens": 230629136.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "grad_norm": 
1.4831249713897705, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8658854961395264, + "num_tokens": 230667556.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "grad_norm": 1.4336519241333008, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8666500449180603, + "num_tokens": 230706307.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "grad_norm": 1.4690762758255005, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8610906600952148, + "num_tokens": 230746023.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "grad_norm": 1.5905096530914307, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8614561557769775, + "num_tokens": 230781129.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "grad_norm": 1.4679486751556396, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.864098846912384, + "num_tokens": 230819450.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "grad_norm": 1.386470079421997, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8764100074768066, + "num_tokens": 230858645.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "grad_norm": 1.3886957168579102, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8786112666130066, + "num_tokens": 230898414.0, + "step": 6056 + }, + { + "epoch": 0.7705126574227198, + "grad_norm": 1.3909579515457153, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8833144903182983, + "num_tokens": 230934811.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "grad_norm": 1.5301694869995117, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8612265586853027, + "num_tokens": 230974158.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "grad_norm": 1.3040997982025146, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.864696741104126, + "num_tokens": 
231021018.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "grad_norm": 1.5540974140167236, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8585094809532166, + "num_tokens": 231059406.0, + "step": 6060 + }, + { + "epoch": 0.7710214985370818, + "grad_norm": 1.3982361555099487, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8765755295753479, + "num_tokens": 231100409.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "grad_norm": 1.408469557762146, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8775473237037659, + "num_tokens": 231136370.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "grad_norm": 1.481720209121704, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8754445314407349, + "num_tokens": 231170610.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "grad_norm": 1.5852078199386597, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8410891890525818, + "num_tokens": 231211000.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "grad_norm": 1.5043089389801025, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8745716214179993, + "num_tokens": 231250144.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "grad_norm": 1.498590111732483, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8577910661697388, + "num_tokens": 231290412.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "grad_norm": 1.4518598318099976, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8662722110748291, + "num_tokens": 231328057.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "grad_norm": 1.4774994850158691, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8787189722061157, + "num_tokens": 231366039.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "grad_norm": 1.5482251644134521, + "learning_rate": 1e-06, + 
"loss": 0.4262, + "mean_token_accuracy": 0.8562308549880981, + "num_tokens": 231404092.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "grad_norm": 1.485154151916504, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8578394651412964, + "num_tokens": 231444120.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "grad_norm": 1.472917079925537, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8671250343322754, + "num_tokens": 231481139.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "grad_norm": 1.4670463800430298, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8873021602630615, + "num_tokens": 231520179.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "grad_norm": 1.579493522644043, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8567438125610352, + "num_tokens": 231559070.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "grad_norm": 1.4722304344177246, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8641277551651001, + "num_tokens": 231598451.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "grad_norm": 1.436016321182251, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8634662628173828, + "num_tokens": 231637167.0, + "step": 6075 + }, + { + "epoch": 0.7729296527159395, + "grad_norm": 1.4248491525650024, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8736701607704163, + "num_tokens": 231673341.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "grad_norm": 1.4353431463241577, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.872504711151123, + "num_tokens": 231711391.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "grad_norm": 1.3391308784484863, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8754037022590637, + "num_tokens": 231756467.0, + "step": 6078 + }, + { + "epoch": 
0.773311283551711, + "grad_norm": 1.5039957761764526, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8598248958587646, + "num_tokens": 231801963.0, + "step": 6079 + }, + { + "epoch": 0.7734384938303015, + "grad_norm": 1.4093749523162842, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8808115720748901, + "num_tokens": 231841708.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "grad_norm": 1.4671409130096436, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8702528476715088, + "num_tokens": 231878142.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "grad_norm": 1.4346028566360474, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8614141345024109, + "num_tokens": 231920318.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "grad_norm": 1.459631085395813, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8843154907226562, + "num_tokens": 231959218.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "grad_norm": 1.5286445617675781, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8771716356277466, + "num_tokens": 231994676.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "grad_norm": 1.493268370628357, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8635393381118774, + "num_tokens": 232031896.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "grad_norm": 1.8206541538238525, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8675227165222168, + "num_tokens": 232072054.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "grad_norm": 1.5587788820266724, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8552204370498657, + "num_tokens": 232110665.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "grad_norm": 1.389993667602539, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 
0.8711316585540771, + "num_tokens": 232150677.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "grad_norm": 1.6140828132629395, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.860863208770752, + "num_tokens": 232184651.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "grad_norm": 1.5296125411987305, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8534178733825684, + "num_tokens": 232221580.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "grad_norm": 1.3927152156829834, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8601057529449463, + "num_tokens": 232265199.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "grad_norm": 1.4147919416427612, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.873313307762146, + "num_tokens": 232304923.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "grad_norm": 1.4379467964172363, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8613890409469604, + "num_tokens": 232345933.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "grad_norm": 1.7672382593154907, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8540771007537842, + "num_tokens": 232379044.0, + "step": 6094 + }, + { + "epoch": 0.7753466480091591, + "grad_norm": 1.701465129852295, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8652555346488953, + "num_tokens": 232409678.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "grad_norm": 1.5346827507019043, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8695039749145508, + "num_tokens": 232447518.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "grad_norm": 1.5242252349853516, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8719582557678223, + "num_tokens": 232486802.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "grad_norm": 
1.6966493129730225, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8644994497299194, + "num_tokens": 232523187.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "grad_norm": 1.3833272457122803, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8783665895462036, + "num_tokens": 232561597.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "grad_norm": 1.4768811464309692, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8707036972045898, + "num_tokens": 232599497.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "grad_norm": 1.8353235721588135, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8723030686378479, + "num_tokens": 232624796.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "grad_norm": 1.3512094020843506, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8842535614967346, + "num_tokens": 232669230.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "grad_norm": 1.503328800201416, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8520584106445312, + "num_tokens": 232712914.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "grad_norm": 1.479372501373291, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.87750244140625, + "num_tokens": 232750049.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "grad_norm": 1.3542375564575195, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8803035020828247, + "num_tokens": 232792051.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "grad_norm": 1.502213954925537, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8633747100830078, + "num_tokens": 232828765.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "grad_norm": 1.4832288026809692, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8785905838012695, + "num_tokens": 
232864550.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "grad_norm": 1.42486572265625, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.872538149356842, + "num_tokens": 232904045.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "grad_norm": 1.4616775512695312, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8631744384765625, + "num_tokens": 232943883.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "grad_norm": 1.4636483192443848, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8615052103996277, + "num_tokens": 232979671.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "grad_norm": 1.4259454011917114, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8631677031517029, + "num_tokens": 233019826.0, + "step": 6111 + }, + { + "epoch": 0.7775092227451978, + "grad_norm": 1.5129075050354004, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8744074702262878, + "num_tokens": 233053390.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "grad_norm": 1.5677136182785034, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.849759578704834, + "num_tokens": 233091382.0, + "step": 6113 + }, + { + "epoch": 0.7777636433023788, + "grad_norm": 1.5103141069412231, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8645853996276855, + "num_tokens": 233127576.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "grad_norm": 1.4931279420852661, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8572893738746643, + "num_tokens": 233164936.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "grad_norm": 1.4810749292373657, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8637583255767822, + "num_tokens": 233203896.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "grad_norm": 1.3998806476593018, + "learning_rate": 1e-06, + 
"loss": 0.3481, + "mean_token_accuracy": 0.8798918128013611, + "num_tokens": 233243314.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "grad_norm": 1.446177363395691, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8667079210281372, + "num_tokens": 233286065.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "grad_norm": 1.459050178527832, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8640190958976746, + "num_tokens": 233325297.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "grad_norm": 1.4411789178848267, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8728348016738892, + "num_tokens": 233363674.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "grad_norm": 1.430930256843567, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8750259280204773, + "num_tokens": 233401042.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "grad_norm": 1.3706413507461548, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8685417175292969, + "num_tokens": 233441486.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "grad_norm": 1.3321924209594727, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8606876134872437, + "num_tokens": 233485718.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "grad_norm": 1.5971567630767822, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8628409504890442, + "num_tokens": 233523216.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "grad_norm": 1.5686720609664917, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8547468781471252, + "num_tokens": 233564874.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "grad_norm": 1.6008505821228027, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.875536322593689, + "num_tokens": 233594127.0, + "step": 6126 + }, + { + "epoch": 
0.7794173769240554, + "grad_norm": 1.5089441537857056, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8546721935272217, + "num_tokens": 233632540.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "grad_norm": 1.4574216604232788, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8564660549163818, + "num_tokens": 233677474.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "grad_norm": 1.51273512840271, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8675363659858704, + "num_tokens": 233715213.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "grad_norm": 1.468805193901062, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8696058988571167, + "num_tokens": 233754471.0, + "step": 6130 + }, + { + "epoch": 0.7799262180384176, + "grad_norm": 1.5976916551589966, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8705133199691772, + "num_tokens": 233791761.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "grad_norm": 1.3904062509536743, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8674057126045227, + "num_tokens": 233834617.0, + "step": 6132 + }, + { + "epoch": 0.7801806385955985, + "grad_norm": 1.5174907445907593, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8700463771820068, + "num_tokens": 233873434.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "grad_norm": 1.5484375953674316, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8555620312690735, + "num_tokens": 233909836.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "grad_norm": 1.4318650960922241, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8742858171463013, + "num_tokens": 233950040.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "grad_norm": 1.5545591115951538, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 
0.8786965012550354, + "num_tokens": 233981588.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "grad_norm": 1.602235198020935, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.879042387008667, + "num_tokens": 234016591.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "grad_norm": 1.5682214498519897, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.870668888092041, + "num_tokens": 234051154.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "grad_norm": 1.5361480712890625, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.877487063407898, + "num_tokens": 234086022.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "grad_norm": 1.6190146207809448, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.85484778881073, + "num_tokens": 234125627.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "grad_norm": 1.387190341949463, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8773473501205444, + "num_tokens": 234170949.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "grad_norm": 1.55597722530365, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8675687313079834, + "num_tokens": 234206564.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "grad_norm": 1.380981206893921, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8727865815162659, + "num_tokens": 234249024.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "grad_norm": 1.4432309865951538, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8668113350868225, + "num_tokens": 234289722.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "grad_norm": 1.634967565536499, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8675190806388855, + "num_tokens": 234321931.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "grad_norm": 1.453107237815857, + 
"learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8707499504089355, + "num_tokens": 234361278.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "grad_norm": 1.623742938041687, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.848242998123169, + "num_tokens": 234394858.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "grad_norm": 1.6398649215698242, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8579716682434082, + "num_tokens": 234431607.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "grad_norm": 1.6219419240951538, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8657540082931519, + "num_tokens": 234469830.0, + "step": 6149 + }, + { + "epoch": 0.7823432133316371, + "grad_norm": 1.6593821048736572, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8412238359451294, + "num_tokens": 234504213.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "grad_norm": 1.6296817064285278, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8611557483673096, + "num_tokens": 234537956.0, + "step": 6151 + }, + { + "epoch": 0.7825976338888182, + "grad_norm": 1.4865326881408691, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8578423857688904, + "num_tokens": 234581179.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "grad_norm": 1.6152108907699585, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.861059308052063, + "num_tokens": 234616673.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "grad_norm": 1.3855483531951904, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8676273822784424, + "num_tokens": 234663602.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "grad_norm": 1.4590165615081787, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8686036467552185, + "num_tokens": 234705400.0, + "step": 6155 + 
}, + { + "epoch": 0.7831064750031802, + "grad_norm": 1.4398497343063354, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.883124053478241, + "num_tokens": 234742298.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "grad_norm": 1.5094603300094604, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8577871322631836, + "num_tokens": 234779398.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "grad_norm": 1.5095360279083252, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8695300817489624, + "num_tokens": 234815318.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "grad_norm": 1.450142502784729, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8506782054901123, + "num_tokens": 234855479.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "grad_norm": 1.3976168632507324, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8776681423187256, + "num_tokens": 234894045.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "grad_norm": 1.7951374053955078, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8543637990951538, + "num_tokens": 234929968.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "grad_norm": 1.6303203105926514, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.865530788898468, + "num_tokens": 234965081.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "grad_norm": 1.4311497211456299, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8644026517868042, + "num_tokens": 235002791.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "grad_norm": 1.574825406074524, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8559824824333191, + "num_tokens": 235037963.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "grad_norm": 1.4614259004592896, + "learning_rate": 1e-06, + "loss": 0.4336, + 
"mean_token_accuracy": 0.8518376350402832, + "num_tokens": 235080084.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "grad_norm": 1.5122902393341064, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8738095760345459, + "num_tokens": 235119583.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "grad_norm": 1.5506203174591064, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8635246753692627, + "num_tokens": 235155771.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "grad_norm": 1.605355978012085, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8618435859680176, + "num_tokens": 235188124.0, + "step": 6168 + }, + { + "epoch": 0.7847602086248568, + "grad_norm": 1.4179089069366455, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8575147390365601, + "num_tokens": 235234070.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "grad_norm": 1.762223243713379, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8578396439552307, + "num_tokens": 235270653.0, + "step": 6170 + }, + { + "epoch": 0.7850146291820379, + "grad_norm": 1.488446593284607, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8690957427024841, + "num_tokens": 235309267.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "grad_norm": 1.491598129272461, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8718688488006592, + "num_tokens": 235349406.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "grad_norm": 1.6051228046417236, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8655162453651428, + "num_tokens": 235382979.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "grad_norm": 1.4316012859344482, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8819639682769775, + "num_tokens": 235417539.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + 
"grad_norm": 1.5105860233306885, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8598443865776062, + "num_tokens": 235452516.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "grad_norm": 1.411346435546875, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8735159635543823, + "num_tokens": 235490306.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "grad_norm": 1.4204344749450684, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8632850050926208, + "num_tokens": 235530242.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "grad_norm": 1.6558047533035278, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8626641035079956, + "num_tokens": 235564606.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "grad_norm": 1.5779732465744019, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8689359426498413, + "num_tokens": 235601912.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "grad_norm": 1.6810247898101807, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.864439845085144, + "num_tokens": 235639734.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "grad_norm": 1.4879088401794434, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8710707426071167, + "num_tokens": 235674114.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "grad_norm": 1.702836513519287, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.859494686126709, + "num_tokens": 235708413.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "grad_norm": 1.5162206888198853, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8723165988922119, + "num_tokens": 235740733.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "grad_norm": 1.4254798889160156, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8758654594421387, + 
"num_tokens": 235781193.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "grad_norm": 1.7333555221557617, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8613919019699097, + "num_tokens": 235818925.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "grad_norm": 1.5896897315979004, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8480818867683411, + "num_tokens": 235858621.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "grad_norm": 1.5235778093338013, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8659929037094116, + "num_tokens": 235896240.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "grad_norm": 1.370449423789978, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8828018307685852, + "num_tokens": 235937867.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "grad_norm": 1.5313564538955688, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8777812719345093, + "num_tokens": 235972733.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "grad_norm": 1.4484821557998657, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8793082237243652, + "num_tokens": 236010580.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "grad_norm": 1.4200440645217896, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8675263524055481, + "num_tokens": 236050842.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "grad_norm": 1.50411856174469, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8823046684265137, + "num_tokens": 236086441.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "grad_norm": 1.4245281219482422, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8720453381538391, + "num_tokens": 236125205.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "grad_norm": 1.4935818910598755, + 
"learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8703439831733704, + "num_tokens": 236161788.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "grad_norm": 1.4471074342727661, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8736629486083984, + "num_tokens": 236203268.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "grad_norm": 1.4116895198822021, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8776271343231201, + "num_tokens": 236241386.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "grad_norm": 1.6058509349822998, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8676779866218567, + "num_tokens": 236271231.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "grad_norm": 1.545201063156128, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8792826533317566, + "num_tokens": 236306569.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "grad_norm": 1.4775460958480835, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.86855149269104, + "num_tokens": 236345441.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "grad_norm": 1.4189058542251587, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8798186779022217, + "num_tokens": 236384452.0, + "step": 6200 + }, + { + "epoch": 0.7888309375397532, + "grad_norm": 1.3411787748336792, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8859930634498596, + "num_tokens": 236426720.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "grad_norm": 1.5317368507385254, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8617075085639954, + "num_tokens": 236464411.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "grad_norm": 1.326809048652649, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8761736154556274, + "num_tokens": 236507966.0, + "step": 6203 + 
}, + { + "epoch": 0.7892125683755248, + "grad_norm": 1.7833465337753296, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8662845492362976, + "num_tokens": 236540305.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "grad_norm": 1.4795310497283936, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8559237718582153, + "num_tokens": 236582281.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "grad_norm": 1.5134294033050537, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8593721389770508, + "num_tokens": 236620066.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "grad_norm": 2.0092763900756836, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8609463572502136, + "num_tokens": 236651836.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "grad_norm": 1.5291177034378052, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8570841550827026, + "num_tokens": 236688331.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "grad_norm": 1.451377034187317, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8792774081230164, + "num_tokens": 236725609.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "grad_norm": 1.5189927816390991, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8603193759918213, + "num_tokens": 236765050.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "grad_norm": 1.4218615293502808, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8638824224472046, + "num_tokens": 236806280.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "grad_norm": 1.4254707098007202, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8728928565979004, + "num_tokens": 236844569.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "grad_norm": 1.5545141696929932, + "learning_rate": 1e-06, + "loss": 0.3983, + 
"mean_token_accuracy": 0.864096999168396, + "num_tokens": 236880192.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "grad_norm": 1.6020269393920898, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.861861526966095, + "num_tokens": 236913614.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "grad_norm": 1.45125150680542, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8745385408401489, + "num_tokens": 236957311.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "grad_norm": 1.551125407218933, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8775234222412109, + "num_tokens": 236992903.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "grad_norm": 1.4837254285812378, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8727034330368042, + "num_tokens": 237031813.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "grad_norm": 1.614355206489563, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8583537936210632, + "num_tokens": 237068124.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "grad_norm": 1.5010766983032227, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8886240720748901, + "num_tokens": 237100402.0, + "step": 6219 + }, + { + "epoch": 0.7912479328329729, + "grad_norm": 1.5072557926177979, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8687621355056763, + "num_tokens": 237137895.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "grad_norm": 1.4718053340911865, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.870916485786438, + "num_tokens": 237177888.0, + "step": 6221 + }, + { + "epoch": 0.791502353390154, + "grad_norm": 1.5201137065887451, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8618334531784058, + "num_tokens": 237213993.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + 
"grad_norm": 1.597264051437378, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8453353643417358, + "num_tokens": 237249053.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "grad_norm": 1.5519057512283325, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8611671924591064, + "num_tokens": 237283322.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "grad_norm": 1.5297749042510986, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8769080638885498, + "num_tokens": 237319274.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "grad_norm": 1.6524271965026855, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8553835153579712, + "num_tokens": 237352411.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "grad_norm": 1.3448034524917603, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8636901378631592, + "num_tokens": 237396079.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "grad_norm": 1.4895143508911133, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8758910894393921, + "num_tokens": 237434519.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "grad_norm": 1.5576790571212769, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.870169997215271, + "num_tokens": 237468321.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "grad_norm": 1.5770254135131836, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8786957263946533, + "num_tokens": 237500386.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "grad_norm": 1.3637018203735352, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8697565793991089, + "num_tokens": 237544596.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "grad_norm": 1.5245267152786255, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8691886067390442, + 
"num_tokens": 237578828.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "grad_norm": 1.5004652738571167, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8618042469024658, + "num_tokens": 237616534.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "grad_norm": 1.549278974533081, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8760204315185547, + "num_tokens": 237651136.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "grad_norm": 1.8154630661010742, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8730639219284058, + "num_tokens": 237680542.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "grad_norm": 1.4458657503128052, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.865371584892273, + "num_tokens": 237719816.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "grad_norm": 1.5646002292633057, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8746086359024048, + "num_tokens": 237752241.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "grad_norm": 1.3479828834533691, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8655388355255127, + "num_tokens": 237797089.0, + "step": 6238 + }, + { + "epoch": 0.7936649281261926, + "grad_norm": 1.3393192291259766, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.862970232963562, + "num_tokens": 237845068.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "grad_norm": 1.5049569606781006, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8636642694473267, + "num_tokens": 237883769.0, + "step": 6240 + }, + { + "epoch": 0.7939193486833737, + "grad_norm": 1.678127646446228, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8463034629821777, + "num_tokens": 237921691.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "grad_norm": 1.4600144624710083, + "learning_rate": 
1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8618461489677429, + "num_tokens": 237961523.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "grad_norm": 1.5209853649139404, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8483542203903198, + "num_tokens": 237999220.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "grad_norm": 1.6669977903366089, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8629924058914185, + "num_tokens": 238032650.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "grad_norm": 1.511665940284729, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8712193965911865, + "num_tokens": 238067919.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "grad_norm": 1.4915342330932617, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8714636564254761, + "num_tokens": 238102904.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "grad_norm": 1.4908117055892944, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8576605319976807, + "num_tokens": 238142566.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "grad_norm": 1.514223575592041, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8720571398735046, + "num_tokens": 238174959.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "grad_norm": 1.3681526184082031, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8801875114440918, + "num_tokens": 238216952.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "grad_norm": 1.5563174486160278, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8538442850112915, + "num_tokens": 238256406.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "grad_norm": 1.433165192604065, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8647851347923279, + "num_tokens": 238296869.0, + "step": 6251 + }, + { + 
"epoch": 0.7953186617478692, + "grad_norm": 1.5221437215805054, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8658578395843506, + "num_tokens": 238335777.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "grad_norm": 1.523788332939148, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8701679110527039, + "num_tokens": 238373561.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "grad_norm": 1.4150358438491821, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8745848536491394, + "num_tokens": 238412271.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "grad_norm": 1.4856680631637573, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8638412356376648, + "num_tokens": 238451640.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "grad_norm": 1.3631857633590698, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8711152076721191, + "num_tokens": 238496469.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "grad_norm": 1.619376540184021, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8481512665748596, + "num_tokens": 238531353.0, + "step": 6257 + }, + { + "epoch": 0.7960819234194123, + "grad_norm": 1.4357906579971313, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8614580631256104, + "num_tokens": 238572158.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "grad_norm": 1.4680092334747314, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.868266224861145, + "num_tokens": 238610294.0, + "step": 6259 + }, + { + "epoch": 0.7963363439765934, + "grad_norm": 1.4218223094940186, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8765991926193237, + "num_tokens": 238652188.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "grad_norm": 1.3944108486175537, + "learning_rate": 1e-06, + "loss": 0.3761, + 
"mean_token_accuracy": 0.8693460822105408, + "num_tokens": 238690542.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "grad_norm": 1.5331988334655762, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8753748536109924, + "num_tokens": 238725344.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "grad_norm": 1.4000555276870728, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8425638675689697, + "num_tokens": 238773251.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "grad_norm": 1.3670557737350464, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8755569458007812, + "num_tokens": 238813930.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "grad_norm": 1.498533844947815, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8649039268493652, + "num_tokens": 238851433.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "grad_norm": 1.4786713123321533, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8754199147224426, + "num_tokens": 238888437.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "grad_norm": 1.3668750524520874, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8769983053207397, + "num_tokens": 238929002.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "grad_norm": 1.4037816524505615, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8665902614593506, + "num_tokens": 238971953.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "grad_norm": 1.5289795398712158, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8706252574920654, + "num_tokens": 239010661.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "grad_norm": 1.535603642463684, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8644959926605225, + "num_tokens": 239050225.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + 
"grad_norm": 1.5630550384521484, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8605210185050964, + "num_tokens": 239082912.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "grad_norm": 1.545335292816162, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8778104782104492, + "num_tokens": 239115972.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "grad_norm": 1.5005784034729004, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8551410436630249, + "num_tokens": 239153707.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "grad_norm": 1.4138500690460205, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8639032244682312, + "num_tokens": 239194427.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "grad_norm": 1.376667857170105, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8770917654037476, + "num_tokens": 239234829.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "grad_norm": 1.3824117183685303, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8851933479309082, + "num_tokens": 239273603.0, + "step": 6276 + }, + { + "epoch": 0.798498918712632, + "grad_norm": 1.4109776020050049, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8665057420730591, + "num_tokens": 239315222.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "grad_norm": 1.5393906831741333, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8502968549728394, + "num_tokens": 239352354.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, + "grad_norm": 1.50736665725708, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.862893283367157, + "num_tokens": 239388511.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "grad_norm": 1.6719038486480713, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8567830920219421, + 
"num_tokens": 239422124.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "grad_norm": 1.5098050832748413, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.860499382019043, + "num_tokens": 239459292.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "grad_norm": 1.7373924255371094, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8518693447113037, + "num_tokens": 239492660.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "grad_norm": 1.4440373182296753, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8781967163085938, + "num_tokens": 239531377.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "grad_norm": 1.5464868545532227, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8820199966430664, + "num_tokens": 239573311.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "grad_norm": 1.439914584159851, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8696995973587036, + "num_tokens": 239613293.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "grad_norm": 1.5104998350143433, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8759747743606567, + "num_tokens": 239652219.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "grad_norm": 1.5475112199783325, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8607598543167114, + "num_tokens": 239688538.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "grad_norm": 1.5152091979980469, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8552109003067017, + "num_tokens": 239727895.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "grad_norm": 1.4543455839157104, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8549563884735107, + "num_tokens": 239770427.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "grad_norm": 1.5279946327209473, + 
"learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.865140974521637, + "num_tokens": 239809150.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "grad_norm": 1.3787198066711426, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8694460988044739, + "num_tokens": 239850263.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "grad_norm": 1.4042065143585205, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8719887137413025, + "num_tokens": 239890040.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "grad_norm": 1.353838562965393, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8739255666732788, + "num_tokens": 239933379.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "grad_norm": 1.4643778800964355, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8653576374053955, + "num_tokens": 239971889.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "grad_norm": 1.476911187171936, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8675719499588013, + "num_tokens": 240008866.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "grad_norm": 1.3456400632858276, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8724886178970337, + "num_tokens": 240054747.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "grad_norm": 1.5141927003860474, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.870120108127594, + "num_tokens": 240091673.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "grad_norm": 1.534747838973999, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.865837574005127, + "num_tokens": 240128571.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "grad_norm": 1.4776504039764404, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8666898608207703, + "num_tokens": 240169489.0, + "step": 6299 + 
}, + { + "epoch": 0.8014247551202137, + "grad_norm": 1.6002495288848877, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8774410486221313, + "num_tokens": 240204001.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "grad_norm": 1.5130611658096313, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8754051327705383, + "num_tokens": 240242853.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "grad_norm": 1.6040970087051392, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8684583902359009, + "num_tokens": 240278901.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "grad_norm": 1.4076241254806519, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8796674013137817, + "num_tokens": 240314827.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "grad_norm": 1.5228244066238403, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.869064450263977, + "num_tokens": 240351162.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "grad_norm": 1.4546393156051636, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8598370552062988, + "num_tokens": 240393417.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "grad_norm": 1.6963391304016113, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8491226434707642, + "num_tokens": 240426785.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "grad_norm": 1.3642271757125854, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8708441257476807, + "num_tokens": 240467654.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "grad_norm": 1.501356840133667, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8626769781112671, + "num_tokens": 240504955.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "grad_norm": 1.7130018472671509, + "learning_rate": 1e-06, + "loss": 0.344, + 
"mean_token_accuracy": 0.8804532885551453, + "num_tokens": 240533919.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "grad_norm": 1.5669822692871094, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8711071014404297, + "num_tokens": 240565835.0, + "step": 6310 + }, + { + "epoch": 0.8028240681847093, + "grad_norm": 1.528630018234253, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.867140531539917, + "num_tokens": 240601129.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "grad_norm": 1.5993889570236206, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.856406569480896, + "num_tokens": 240636007.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "grad_norm": 1.4816813468933105, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8791043758392334, + "num_tokens": 240674059.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "grad_norm": 1.4883233308792114, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8757866024971008, + "num_tokens": 240712013.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "grad_norm": 1.479814887046814, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8790835738182068, + "num_tokens": 240748194.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "grad_norm": 1.3915941715240479, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8582051992416382, + "num_tokens": 240793187.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "grad_norm": 1.4223507642745972, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.861659049987793, + "num_tokens": 240835973.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "grad_norm": 1.4911117553710938, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8724054098129272, + "num_tokens": 240871772.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + 
"grad_norm": 1.5079494714736938, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.862542986869812, + "num_tokens": 240912249.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "grad_norm": 1.4210598468780518, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8688544631004333, + "num_tokens": 240953137.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "grad_norm": 1.4083011150360107, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8749365210533142, + "num_tokens": 241001226.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "grad_norm": 1.5784021615982056, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8697057962417603, + "num_tokens": 241036006.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "grad_norm": 1.3716269731521606, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8734917044639587, + "num_tokens": 241077819.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "grad_norm": 1.4685814380645752, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8666293025016785, + "num_tokens": 241115507.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "grad_norm": 1.5107953548431396, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8748672604560852, + "num_tokens": 241154818.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "grad_norm": 1.5034971237182617, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8618655204772949, + "num_tokens": 241193357.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "grad_norm": 1.4447598457336426, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8718869686126709, + "num_tokens": 241232268.0, + "step": 6327 + }, + { + "epoch": 0.804986642920748, + "grad_norm": 1.4799528121948242, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8564850687980652, + 
"num_tokens": 241270607.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "grad_norm": 1.7699260711669922, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8834396600723267, + "num_tokens": 241310997.0, + "step": 6329 + }, + { + "epoch": 0.805241063477929, + "grad_norm": 1.5720829963684082, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.880600094795227, + "num_tokens": 241346057.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "grad_norm": 1.6243149042129517, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.859353244304657, + "num_tokens": 241383988.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "grad_norm": 1.5989803075790405, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.861574113368988, + "num_tokens": 241419732.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "grad_norm": 1.4907951354980469, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8680171370506287, + "num_tokens": 241458688.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "grad_norm": 1.5628222227096558, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8772923946380615, + "num_tokens": 241492304.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "grad_norm": 1.4771698713302612, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8695712685585022, + "num_tokens": 241529659.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "grad_norm": 1.4553054571151733, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8644603490829468, + "num_tokens": 241573291.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "grad_norm": 1.4923828840255737, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8681418895721436, + "num_tokens": 241614755.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "grad_norm": 1.5321940183639526, + 
"learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8609684109687805, + "num_tokens": 241652301.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "grad_norm": 1.5299577713012695, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8542878031730652, + "num_tokens": 241695939.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "grad_norm": 1.457984209060669, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8781782984733582, + "num_tokens": 241730670.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "grad_norm": 1.6320374011993408, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8570107221603394, + "num_tokens": 241764528.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "grad_norm": 1.3967537879943848, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8769940137863159, + "num_tokens": 241805881.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "grad_norm": 1.510930061340332, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8591461181640625, + "num_tokens": 241846141.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "grad_norm": 1.4022068977355957, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8682414293289185, + "num_tokens": 241886970.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "grad_norm": 1.4721274375915527, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8731307983398438, + "num_tokens": 241928722.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "grad_norm": 1.4902324676513672, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8707047700881958, + "num_tokens": 241967176.0, + "step": 6346 + }, + { + "epoch": 0.8074036382139677, + "grad_norm": 1.556422472000122, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8627941012382507, + "num_tokens": 242004201.0, + "step": 6347 
+ }, + { + "epoch": 0.8075308484925582, + "grad_norm": 1.3638105392456055, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8720904588699341, + "num_tokens": 242043571.0, + "step": 6348 + }, + { + "epoch": 0.8076580587711487, + "grad_norm": 1.4202799797058105, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8615528345108032, + "num_tokens": 242083637.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "grad_norm": 1.409420371055603, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8775045275688171, + "num_tokens": 242122516.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "grad_norm": 1.3637988567352295, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8820986747741699, + "num_tokens": 242162990.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "grad_norm": 1.546168565750122, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8610772490501404, + "num_tokens": 242200635.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "grad_norm": 1.4883309602737427, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8675140142440796, + "num_tokens": 242238005.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "grad_norm": 1.4063003063201904, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8573299646377563, + "num_tokens": 242281314.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "grad_norm": 1.6772843599319458, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.861971914768219, + "num_tokens": 242314848.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "grad_norm": 1.5037740468978882, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8548359274864197, + "num_tokens": 242356304.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "grad_norm": 1.466949224472046, + "learning_rate": 1e-06, + "loss": 0.3795, + 
"mean_token_accuracy": 0.8674870729446411, + "num_tokens": 242395659.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "grad_norm": 1.4709240198135376, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8757917881011963, + "num_tokens": 242433103.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "grad_norm": 1.5873082876205444, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8573182225227356, + "num_tokens": 242470626.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "grad_norm": 1.502269983291626, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8620960712432861, + "num_tokens": 242507768.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "grad_norm": 1.528543472290039, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8727713823318481, + "num_tokens": 242544777.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "grad_norm": 1.5235464572906494, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8563914895057678, + "num_tokens": 242585112.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "grad_norm": 1.39605712890625, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.854056715965271, + "num_tokens": 242630002.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "grad_norm": 1.6296806335449219, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8661094903945923, + "num_tokens": 242666006.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "grad_norm": 1.3827106952667236, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8696099519729614, + "num_tokens": 242706094.0, + "step": 6365 + }, + { + "epoch": 0.8098206335071874, + "grad_norm": 1.4946625232696533, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8606083393096924, + "num_tokens": 242746412.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + 
"grad_norm": 1.5709589719772339, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8567771911621094, + "num_tokens": 242781948.0, + "step": 6367 + }, + { + "epoch": 0.8100750540643684, + "grad_norm": 1.5136964321136475, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8661823272705078, + "num_tokens": 242819919.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "grad_norm": 1.7875938415527344, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8642826080322266, + "num_tokens": 242850782.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "grad_norm": 1.5309462547302246, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8792715072631836, + "num_tokens": 242883490.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "grad_norm": 1.5422441959381104, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8802322149276733, + "num_tokens": 242916154.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "grad_norm": 1.4665846824645996, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8792332410812378, + "num_tokens": 242954872.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "grad_norm": 1.5510808229446411, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8638010025024414, + "num_tokens": 242990351.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "grad_norm": 1.4826618432998657, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8749764561653137, + "num_tokens": 243028801.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "grad_norm": 1.5624068975448608, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8597110509872437, + "num_tokens": 243064349.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "grad_norm": 1.5255476236343384, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8633739352226257, + 
"num_tokens": 243103792.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "grad_norm": 1.569278359413147, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.842194676399231, + "num_tokens": 243144047.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "grad_norm": 1.3736343383789062, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8603389859199524, + "num_tokens": 243184251.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "grad_norm": 1.4864821434020996, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8648453950881958, + "num_tokens": 243222851.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "grad_norm": 1.523072361946106, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8616271018981934, + "num_tokens": 243260330.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "grad_norm": 1.553334355354309, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8818824291229248, + "num_tokens": 243292147.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "grad_norm": 1.4693511724472046, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8788537979125977, + "num_tokens": 243328800.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "grad_norm": 1.6008315086364746, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8508701920509338, + "num_tokens": 243365825.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "grad_norm": 1.3965941667556763, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8751429319381714, + "num_tokens": 243405652.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "grad_norm": 1.4162073135375977, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8714870810508728, + "num_tokens": 243448855.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "grad_norm": 1.49696946144104, + "learning_rate": 
1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8677579760551453, + "num_tokens": 243485019.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "grad_norm": 1.462733507156372, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8774351477622986, + "num_tokens": 243521145.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "grad_norm": 1.362069845199585, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8675645589828491, + "num_tokens": 243564001.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "grad_norm": 1.4091222286224365, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.880494236946106, + "num_tokens": 243600235.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "grad_norm": 1.5180885791778564, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8660977482795715, + "num_tokens": 243635871.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "grad_norm": 1.477084755897522, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8704100251197815, + "num_tokens": 243674217.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "grad_norm": 1.5267200469970703, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8559653759002686, + "num_tokens": 243712657.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "grad_norm": 1.3713749647140503, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8703048229217529, + "num_tokens": 243754180.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "grad_norm": 1.4684687852859497, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.869330883026123, + "num_tokens": 243791295.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "grad_norm": 1.6976943016052246, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8597249984741211, + "num_tokens": 243824934.0, + "step": 6395 + }, + { + 
"epoch": 0.8136369418649027, + "grad_norm": 1.4696868658065796, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8605006337165833, + "num_tokens": 243863823.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "grad_norm": 1.5262444019317627, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8718724846839905, + "num_tokens": 243901754.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "grad_norm": 1.593865990638733, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8684336543083191, + "num_tokens": 243934751.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "grad_norm": 1.4747811555862427, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8807996511459351, + "num_tokens": 243970045.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "grad_norm": 1.5023047924041748, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8722285628318787, + "num_tokens": 244006612.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "grad_norm": 1.4610700607299805, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8631163835525513, + "num_tokens": 244046572.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "grad_norm": 1.5416334867477417, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8762502074241638, + "num_tokens": 244081859.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "grad_norm": 1.5826430320739746, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8662567138671875, + "num_tokens": 244114209.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "grad_norm": 1.4353162050247192, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8626635670661926, + "num_tokens": 244155059.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "grad_norm": 1.5899279117584229, + "learning_rate": 1e-06, + "loss": 0.3787, + 
"mean_token_accuracy": 0.8714421987533569, + "num_tokens": 244193798.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "grad_norm": 1.6196649074554443, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8593063354492188, + "num_tokens": 244229542.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "grad_norm": 1.543632984161377, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8637452125549316, + "num_tokens": 244266927.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "grad_norm": 1.5993421077728271, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8715229630470276, + "num_tokens": 244306097.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "grad_norm": 1.4377856254577637, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8695276975631714, + "num_tokens": 244345748.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "grad_norm": 1.5406333208084106, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8524583578109741, + "num_tokens": 244379956.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "grad_norm": 1.6288617849349976, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8572150468826294, + "num_tokens": 244418709.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "grad_norm": 1.538196325302124, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.865784764289856, + "num_tokens": 244455664.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "grad_norm": 1.483398199081421, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8624575138092041, + "num_tokens": 244493249.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "grad_norm": 1.603000283241272, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8675584197044373, + "num_tokens": 244530653.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + 
"grad_norm": 1.3548442125320435, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8786923885345459, + "num_tokens": 244575190.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "grad_norm": 1.3683308362960815, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8668492436408997, + "num_tokens": 244618765.0, + "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "grad_norm": 1.3364319801330566, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8680008053779602, + "num_tokens": 244664122.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "grad_norm": 1.5549319982528687, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8670940399169922, + "num_tokens": 244700359.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "grad_norm": 1.477998971939087, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8602132797241211, + "num_tokens": 244741801.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "grad_norm": 1.382681965827942, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8826242089271545, + "num_tokens": 244781675.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "grad_norm": 1.5882587432861328, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8643829822540283, + "num_tokens": 244818633.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "grad_norm": 1.4677536487579346, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8731005191802979, + "num_tokens": 244855135.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "grad_norm": 1.4770299196243286, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8715322017669678, + "num_tokens": 244890128.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "grad_norm": 1.4540983438491821, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8599362373352051, + 
"num_tokens": 244928678.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "grad_norm": 1.5053836107254028, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8535844683647156, + "num_tokens": 244967095.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "grad_norm": 1.4179975986480713, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8708864450454712, + "num_tokens": 245006188.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "grad_norm": 1.4759310483932495, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8753818273544312, + "num_tokens": 245041381.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "grad_norm": 1.6279933452606201, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8745686411857605, + "num_tokens": 245073494.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "grad_norm": 1.4039771556854248, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8603705763816833, + "num_tokens": 245115546.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "grad_norm": 1.429855465888977, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8754258751869202, + "num_tokens": 245152203.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "grad_norm": 1.748268485069275, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8467257022857666, + "num_tokens": 245182800.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "grad_norm": 1.5337145328521729, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8708269596099854, + "num_tokens": 245218278.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "grad_norm": 1.4132264852523804, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8723486065864563, + "num_tokens": 245262750.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "grad_norm": 1.4506518840789795, + 
"learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8778224587440491, + "num_tokens": 245297539.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "grad_norm": 1.4253836870193481, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8728776574134827, + "num_tokens": 245334988.0, + "step": 6435 + }, + { + "epoch": 0.8187253530085231, + "grad_norm": 1.472495198249817, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8745096325874329, + "num_tokens": 245372956.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "grad_norm": 1.749585747718811, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8541160821914673, + "num_tokens": 245403094.0, + "step": 6437 + }, + { + "epoch": 0.8189797735657041, + "grad_norm": 1.5971113443374634, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8714906573295593, + "num_tokens": 245435649.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "grad_norm": 1.2977104187011719, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8744473457336426, + "num_tokens": 245479428.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "grad_norm": 1.443351149559021, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8572590351104736, + "num_tokens": 245520552.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "grad_norm": 1.4989712238311768, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8653558492660522, + "num_tokens": 245556979.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "grad_norm": 1.4616622924804688, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.877522885799408, + "num_tokens": 245593617.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "grad_norm": 1.501062035560608, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8742162585258484, + "num_tokens": 245627481.0, + "step": 6443 + 
}, + { + "epoch": 0.8197430352372471, + "grad_norm": 1.57756769657135, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8524578809738159, + "num_tokens": 245664255.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "grad_norm": 1.4765942096710205, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8679388761520386, + "num_tokens": 245704453.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "grad_norm": 1.579053282737732, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8547122478485107, + "num_tokens": 245740841.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "grad_norm": 1.3728662729263306, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8714818954467773, + "num_tokens": 245779187.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "grad_norm": 1.3924018144607544, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8824679851531982, + "num_tokens": 245821732.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "grad_norm": 1.4551995992660522, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8726279735565186, + "num_tokens": 245859349.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "grad_norm": 1.4397913217544556, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8572705984115601, + "num_tokens": 245898974.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "grad_norm": 1.4869524240493774, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.866408109664917, + "num_tokens": 245935410.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "grad_norm": 1.4491453170776367, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8810685276985168, + "num_tokens": 245971685.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "grad_norm": 1.7415562868118286, + "learning_rate": 1e-06, + "loss": 0.4179, + 
"mean_token_accuracy": 0.858004093170166, + "num_tokens": 246004714.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "grad_norm": 1.5096768140792847, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8536958694458008, + "num_tokens": 246046539.0, + "step": 6454 + }, + { + "epoch": 0.8211423483017428, + "grad_norm": 1.5349135398864746, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8563531637191772, + "num_tokens": 246086563.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "grad_norm": 1.3962457180023193, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8798434138298035, + "num_tokens": 246125206.0, + "step": 6456 + }, + { + "epoch": 0.8213967688589238, + "grad_norm": 1.5908033847808838, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8627427816390991, + "num_tokens": 246162175.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "grad_norm": 1.3951847553253174, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8775210380554199, + "num_tokens": 246200827.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "grad_norm": 1.5345797538757324, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8705399036407471, + "num_tokens": 246238127.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "grad_norm": 1.342233419418335, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.878807544708252, + "num_tokens": 246279477.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "grad_norm": 1.4069998264312744, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.868363618850708, + "num_tokens": 246316932.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "grad_norm": 1.4473764896392822, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8568426370620728, + "num_tokens": 246361324.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + 
"grad_norm": 1.4943987131118774, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.860650897026062, + "num_tokens": 246402666.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "grad_norm": 1.5092710256576538, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8796569108963013, + "num_tokens": 246436163.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "grad_norm": 1.401045560836792, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8731890320777893, + "num_tokens": 246476406.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "grad_norm": 1.596092939376831, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8522706031799316, + "num_tokens": 246512570.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "grad_norm": 1.420823574066162, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8602162003517151, + "num_tokens": 246553347.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "grad_norm": 1.4106853008270264, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8718340992927551, + "num_tokens": 246590996.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "grad_norm": 1.3940397500991821, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8793789148330688, + "num_tokens": 246628298.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "grad_norm": 1.4759495258331299, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.870990514755249, + "num_tokens": 246664475.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "grad_norm": 1.4522243738174438, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8545342683792114, + "num_tokens": 246707871.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "grad_norm": 1.3549550771713257, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8758682608604431, + 
"num_tokens": 246748385.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "grad_norm": 1.5015482902526855, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8687633275985718, + "num_tokens": 246785798.0, + "step": 6473 + }, + { + "epoch": 0.8235593435949625, + "grad_norm": 1.6578335762023926, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8609521985054016, + "num_tokens": 246815634.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "grad_norm": 1.492379903793335, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8716344833374023, + "num_tokens": 246849490.0, + "step": 6475 + }, + { + "epoch": 0.8238137641521435, + "grad_norm": 1.544725775718689, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8726247549057007, + "num_tokens": 246886705.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "grad_norm": 1.569427490234375, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8617531061172485, + "num_tokens": 246919024.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "grad_norm": 1.5718413591384888, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8742550015449524, + "num_tokens": 246951954.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "grad_norm": 1.4772976636886597, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8634318113327026, + "num_tokens": 246992198.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "grad_norm": 1.375059962272644, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8839312791824341, + "num_tokens": 247033461.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "grad_norm": 1.3702155351638794, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8673569560050964, + "num_tokens": 247076328.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "grad_norm": 1.4852384328842163, + "learning_rate": 
1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8611383438110352, + "num_tokens": 247113773.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "grad_norm": 1.4080637693405151, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.869218111038208, + "num_tokens": 247151086.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "grad_norm": 1.6337363719940186, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.854358971118927, + "num_tokens": 247185554.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "grad_norm": 1.7233772277832031, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.842530369758606, + "num_tokens": 247222630.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "grad_norm": 1.5580662488937378, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8857917189598083, + "num_tokens": 247260705.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "grad_norm": 1.470085859298706, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8784332275390625, + "num_tokens": 247297807.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "grad_norm": 1.5775405168533325, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8734938502311707, + "num_tokens": 247331284.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "grad_norm": 1.4063191413879395, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8669296503067017, + "num_tokens": 247373212.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "grad_norm": 1.4162217378616333, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8665979504585266, + "num_tokens": 247412133.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "grad_norm": 1.2942578792572021, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8752802014350891, + "num_tokens": 247456698.0, + "step": 6491 + }, + { + 
"epoch": 0.8258491286095917, + "grad_norm": 1.4085233211517334, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8693520426750183, + "num_tokens": 247496876.0, + "step": 6492 + }, + { + "epoch": 0.8259763388881821, + "grad_norm": 1.376285433769226, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8882191777229309, + "num_tokens": 247530681.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "grad_norm": 1.432464599609375, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8717210292816162, + "num_tokens": 247568409.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "grad_norm": 1.4485266208648682, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8782374262809753, + "num_tokens": 247607972.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "grad_norm": 1.4790854454040527, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8628413677215576, + "num_tokens": 247645118.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "grad_norm": 1.4751629829406738, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.872740626335144, + "num_tokens": 247680267.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "grad_norm": 1.5025182962417603, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.860931396484375, + "num_tokens": 247719219.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "grad_norm": 1.3675477504730225, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8764351010322571, + "num_tokens": 247756860.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "grad_norm": 1.415677785873413, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8651421070098877, + "num_tokens": 247797087.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "grad_norm": 1.6304960250854492, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 
0.8614470362663269, + "num_tokens": 247830476.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "grad_norm": 1.4182876348495483, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8764055967330933, + "num_tokens": 247870621.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "grad_norm": 1.4470791816711426, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.863114595413208, + "num_tokens": 247907259.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "grad_norm": 1.4426652193069458, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8590728044509888, + "num_tokens": 247950559.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "grad_norm": 1.414542317390442, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.874396562576294, + "num_tokens": 247989563.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "grad_norm": 1.3543754816055298, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8620505928993225, + "num_tokens": 248032386.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "grad_norm": 1.6339370012283325, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8548353910446167, + "num_tokens": 248072395.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "grad_norm": 1.5069047212600708, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8625260591506958, + "num_tokens": 248110654.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "grad_norm": 1.5892804861068726, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8757017254829407, + "num_tokens": 248143902.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "grad_norm": 1.3608555793762207, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8843545913696289, + "num_tokens": 248180605.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "grad_norm": 
1.466799020767212, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8737925887107849, + "num_tokens": 248218378.0, + "step": 6511 + }, + { + "epoch": 0.8283933341814018, + "grad_norm": 1.4590190649032593, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8633724451065063, + "num_tokens": 248257652.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "grad_norm": 1.4678432941436768, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.866969645023346, + "num_tokens": 248296927.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "grad_norm": 1.5467993021011353, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8527169227600098, + "num_tokens": 248335018.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "grad_norm": 1.4476226568222046, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.864487886428833, + "num_tokens": 248374837.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "grad_norm": 1.4549808502197266, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8490861654281616, + "num_tokens": 248418754.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "grad_norm": 1.4730297327041626, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.876144289970398, + "num_tokens": 248452252.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "grad_norm": 1.3928561210632324, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8717032670974731, + "num_tokens": 248492224.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "grad_norm": 1.4863975048065186, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8610802292823792, + "num_tokens": 248529504.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "grad_norm": 1.6268200874328613, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8584269881248474, + "num_tokens": 
248561008.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "grad_norm": 1.4849376678466797, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8758671283721924, + "num_tokens": 248598331.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "grad_norm": 1.4261263608932495, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8640267252922058, + "num_tokens": 248641266.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "grad_norm": 1.4723984003067017, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8859859704971313, + "num_tokens": 248675036.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "grad_norm": 1.531825304031372, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8782068490982056, + "num_tokens": 248708524.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "grad_norm": 1.5123553276062012, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8615663647651672, + "num_tokens": 248744808.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "grad_norm": 1.4746713638305664, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8686107397079468, + "num_tokens": 248782876.0, + "step": 6526 + }, + { + "epoch": 0.8303014883602595, + "grad_norm": 1.4735350608825684, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.868882954120636, + "num_tokens": 248818269.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "grad_norm": 1.4332399368286133, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8814812302589417, + "num_tokens": 248855566.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "grad_norm": 1.4050357341766357, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8567715287208557, + "num_tokens": 248902169.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "grad_norm": 1.473533034324646, + "learning_rate": 1e-06, + 
"loss": 0.3564, + "mean_token_accuracy": 0.8755661249160767, + "num_tokens": 248940061.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "grad_norm": 1.5248104333877563, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8612542748451233, + "num_tokens": 248979142.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "grad_norm": 1.5020685195922852, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8559064865112305, + "num_tokens": 249015385.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "grad_norm": 1.5214859247207642, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8645457029342651, + "num_tokens": 249052948.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "grad_norm": 1.4566744565963745, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8727580308914185, + "num_tokens": 249089048.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "grad_norm": 1.4025704860687256, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.861781656742096, + "num_tokens": 249130086.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "grad_norm": 1.331688642501831, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8742607831954956, + "num_tokens": 249171763.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "grad_norm": 1.4665542840957642, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8570466041564941, + "num_tokens": 249211569.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "grad_norm": 1.5233525037765503, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8875843286514282, + "num_tokens": 249244620.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "grad_norm": 1.4388389587402344, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.849711537361145, + "num_tokens": 249290288.0, + "step": 6539 + }, + { + "epoch": 
0.8319552219819362, + "grad_norm": 1.5218786001205444, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8662288188934326, + "num_tokens": 249327659.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "grad_norm": 1.4532976150512695, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.860304057598114, + "num_tokens": 249369557.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "grad_norm": 1.5138311386108398, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8713926076889038, + "num_tokens": 249406287.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "grad_norm": 1.301163673400879, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8816916942596436, + "num_tokens": 249449005.0, + "step": 6543 + }, + { + "epoch": 0.8324640630962982, + "grad_norm": 1.4200934171676636, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8557119369506836, + "num_tokens": 249495051.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "grad_norm": 1.4577651023864746, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8790585398674011, + "num_tokens": 249531894.0, + "step": 6545 + }, + { + "epoch": 0.8327184836534792, + "grad_norm": 1.3271628618240356, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.875799298286438, + "num_tokens": 249575337.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "grad_norm": 1.4325730800628662, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8716734647750854, + "num_tokens": 249614505.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "grad_norm": 1.562206745147705, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8749932050704956, + "num_tokens": 249647057.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "grad_norm": 1.4497343301773071, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 
0.8605027198791504, + "num_tokens": 249686421.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "grad_norm": 1.5391216278076172, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8745756149291992, + "num_tokens": 249723486.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "grad_norm": 1.5171833038330078, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8562525510787964, + "num_tokens": 249762340.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "grad_norm": 1.4129910469055176, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8567725419998169, + "num_tokens": 249806925.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "grad_norm": 1.5767731666564941, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8632017970085144, + "num_tokens": 249843177.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "grad_norm": 1.421565055847168, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.874480128288269, + "num_tokens": 249882853.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "grad_norm": 1.433885097503662, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8724806308746338, + "num_tokens": 249920781.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "grad_norm": 1.515844702720642, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8677978515625, + "num_tokens": 249956403.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "grad_norm": 1.5519859790802002, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8873602151870728, + "num_tokens": 249989822.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "grad_norm": 1.4794355630874634, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8818763494491577, + "num_tokens": 250023370.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "grad_norm": 
1.465918779373169, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8588981032371521, + "num_tokens": 250062870.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "grad_norm": 1.3476542234420776, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8739095330238342, + "num_tokens": 250103970.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "grad_norm": 1.5091503858566284, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8769149780273438, + "num_tokens": 250137271.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "grad_norm": 1.405917763710022, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8647733926773071, + "num_tokens": 250183116.0, + "step": 6562 + }, + { + "epoch": 0.8348810583895179, + "grad_norm": 1.3952500820159912, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.875190258026123, + "num_tokens": 250224574.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "grad_norm": 1.529516577720642, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8526557087898254, + "num_tokens": 250265450.0, + "step": 6564 + }, + { + "epoch": 0.8351354789466989, + "grad_norm": 1.5151176452636719, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8708505630493164, + "num_tokens": 250301445.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "grad_norm": 1.6187978982925415, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8715150356292725, + "num_tokens": 250334259.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "grad_norm": 1.5467956066131592, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8705397844314575, + "num_tokens": 250368469.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "grad_norm": 1.365144968032837, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8754945993423462, + "num_tokens": 
250408905.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "grad_norm": 1.5840874910354614, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8619743585586548, + "num_tokens": 250446698.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "grad_norm": 1.3887977600097656, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8703864812850952, + "num_tokens": 250491184.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "grad_norm": 1.4239140748977661, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8730729818344116, + "num_tokens": 250529837.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "grad_norm": 1.359390139579773, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8732481002807617, + "num_tokens": 250571776.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "grad_norm": 1.465080976486206, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8617727756500244, + "num_tokens": 250610199.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "grad_norm": 1.4538886547088623, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.870388388633728, + "num_tokens": 250650214.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "grad_norm": 1.5210416316986084, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8618664741516113, + "num_tokens": 250689325.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "grad_norm": 1.443648099899292, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8604142069816589, + "num_tokens": 250730145.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "grad_norm": 1.4131529331207275, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8795074224472046, + "num_tokens": 250767340.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "grad_norm": 1.5963873863220215, + "learning_rate": 1e-06, + 
"loss": 0.3948, + "mean_token_accuracy": 0.8633209466934204, + "num_tokens": 250804613.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "grad_norm": 1.4796044826507568, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8701411485671997, + "num_tokens": 250848122.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "grad_norm": 1.604777216911316, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8563024401664734, + "num_tokens": 250881602.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "grad_norm": 1.4153190851211548, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8697327971458435, + "num_tokens": 250920906.0, + "step": 6581 + }, + { + "epoch": 0.8372980536827376, + "grad_norm": 1.394858717918396, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8773192167282104, + "num_tokens": 250961720.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "grad_norm": 1.410785436630249, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8689988255500793, + "num_tokens": 251002064.0, + "step": 6583 + }, + { + "epoch": 0.8375524742399186, + "grad_norm": 1.4938777685165405, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8761796355247498, + "num_tokens": 251039103.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "grad_norm": 1.5237973928451538, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8734867572784424, + "num_tokens": 251076324.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "grad_norm": 1.529549241065979, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8651366829872131, + "num_tokens": 251114732.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "grad_norm": 1.4794443845748901, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8710877895355225, + "num_tokens": 251152219.0, + "step": 6587 + }, + { + "epoch": 
0.8380613153542806, + "grad_norm": 1.4401320219039917, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.879615306854248, + "num_tokens": 251185949.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "grad_norm": 1.3916186094284058, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8825967311859131, + "num_tokens": 251223843.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "grad_norm": 1.384519338607788, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.87032151222229, + "num_tokens": 251266094.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "grad_norm": 1.3779296875, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8786731362342834, + "num_tokens": 251309656.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "grad_norm": 1.448852300643921, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8618688583374023, + "num_tokens": 251352216.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "grad_norm": 1.4237490892410278, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8777117133140564, + "num_tokens": 251391805.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "grad_norm": 1.5326634645462036, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8490586280822754, + "num_tokens": 251428654.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "grad_norm": 1.4924954175949097, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8555953502655029, + "num_tokens": 251470141.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "grad_norm": 1.3676856756210327, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8730463981628418, + "num_tokens": 251514150.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "grad_norm": 1.4848790168762207, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 
0.8665931224822998, + "num_tokens": 251552049.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "grad_norm": 1.5165297985076904, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8708973526954651, + "num_tokens": 251584928.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "grad_norm": 1.6930556297302246, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8562527894973755, + "num_tokens": 251617785.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "grad_norm": 1.7007100582122803, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8635736703872681, + "num_tokens": 251652170.0, + "step": 6600 + }, + { + "epoch": 0.8397150489759573, + "grad_norm": 1.4821354150772095, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8729767799377441, + "num_tokens": 251689048.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "grad_norm": 1.3515626192092896, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8760847449302673, + "num_tokens": 251730841.0, + "step": 6602 + }, + { + "epoch": 0.8399694695331382, + "grad_norm": 1.390113353729248, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8735486268997192, + "num_tokens": 251769262.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "grad_norm": 1.5028917789459229, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8664063215255737, + "num_tokens": 251805521.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "grad_norm": 1.5221805572509766, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8770143985748291, + "num_tokens": 251843519.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "grad_norm": 1.48995041847229, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8463829159736633, + "num_tokens": 251888934.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "grad_norm": 
1.529971718788147, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8601936101913452, + "num_tokens": 251929956.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "grad_norm": 1.5176459550857544, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8583320379257202, + "num_tokens": 251968643.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "grad_norm": 1.3928390741348267, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8669404983520508, + "num_tokens": 252012591.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "grad_norm": 1.4902117252349854, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8843947649002075, + "num_tokens": 252047966.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "grad_norm": 1.362621784210205, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8806458115577698, + "num_tokens": 252088493.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "grad_norm": 16.37474250793457, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8638692498207092, + "num_tokens": 252125341.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "grad_norm": 1.6014931201934814, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8476961255073547, + "num_tokens": 252159047.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "grad_norm": 1.5768673419952393, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8630865216255188, + "num_tokens": 252193688.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "grad_norm": 1.4498379230499268, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8771801590919495, + "num_tokens": 252231221.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "grad_norm": 1.517508864402771, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8664699792861938, + "num_tokens": 
252269723.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "grad_norm": 1.490893006324768, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8781126141548157, + "num_tokens": 252302806.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "grad_norm": 1.40889310836792, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8675926923751831, + "num_tokens": 252342019.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "grad_norm": 1.4099578857421875, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8710843324661255, + "num_tokens": 252383116.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "grad_norm": 1.576792597770691, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8667171001434326, + "num_tokens": 252418584.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "grad_norm": 1.6486231088638306, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8650293350219727, + "num_tokens": 252450516.0, + "step": 6621 + }, + { + "epoch": 0.8423864648263579, + "grad_norm": 1.373260736465454, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8815418481826782, + "num_tokens": 252489838.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "grad_norm": 1.5733541250228882, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8597080707550049, + "num_tokens": 252525999.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "grad_norm": 1.5201174020767212, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8705819845199585, + "num_tokens": 252561241.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "grad_norm": 1.4883058071136475, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8700095415115356, + "num_tokens": 252600283.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "grad_norm": 1.5399212837219238, + "learning_rate": 1e-06, + 
"loss": 0.5171, + "mean_token_accuracy": 0.8343583345413208, + "num_tokens": 252642872.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "grad_norm": 1.3976191282272339, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8669442534446716, + "num_tokens": 252685897.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "grad_norm": 1.4480652809143066, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8656051158905029, + "num_tokens": 252723499.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "grad_norm": 1.5540568828582764, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.874845564365387, + "num_tokens": 252754957.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "grad_norm": 1.5243216753005981, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8595198392868042, + "num_tokens": 252791309.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "grad_norm": 1.4725650548934937, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8688976764678955, + "num_tokens": 252829206.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "grad_norm": 1.358282446861267, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8675963878631592, + "num_tokens": 252871124.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "grad_norm": 1.3829556703567505, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8840829730033875, + "num_tokens": 252907508.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "grad_norm": 1.3825546503067017, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8555490970611572, + "num_tokens": 252949724.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "grad_norm": 1.5369374752044678, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8708877563476562, + "num_tokens": 252987850.0, + "step": 6635 + }, + { + "epoch": 
0.8441674087266251, + "grad_norm": 1.4624468088150024, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8656312227249146, + "num_tokens": 253026028.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "grad_norm": 1.5421308279037476, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8557167053222656, + "num_tokens": 253067712.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "grad_norm": 1.5557729005813599, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8889762163162231, + "num_tokens": 253101024.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "grad_norm": 1.6215825080871582, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8660796880722046, + "num_tokens": 253133544.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "grad_norm": 1.3386410474777222, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8670583963394165, + "num_tokens": 253180409.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "grad_norm": 1.6588135957717896, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8606387972831726, + "num_tokens": 253212657.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "grad_norm": 1.5311609506607056, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8741574883460999, + "num_tokens": 253249820.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "grad_norm": 1.5121463537216187, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8746665716171265, + "num_tokens": 253283728.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "grad_norm": 1.458694577217102, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8621587753295898, + "num_tokens": 253327011.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "grad_norm": 1.6196818351745605, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 
0.869360089302063, + "num_tokens": 253362273.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "grad_norm": 1.3817847967147827, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8709250688552856, + "num_tokens": 253402854.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "grad_norm": 1.4552092552185059, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8618488311767578, + "num_tokens": 253444333.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "grad_norm": 1.6354554891586304, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.880109429359436, + "num_tokens": 253485009.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "grad_norm": 1.471337080001831, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.863925576210022, + "num_tokens": 253523951.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "grad_norm": 1.4911904335021973, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.875146746635437, + "num_tokens": 253560441.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "grad_norm": 1.4196187257766724, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8748355507850647, + "num_tokens": 253602340.0, + "step": 6651 + }, + { + "epoch": 0.8462027731840732, + "grad_norm": 1.4108388423919678, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.861072301864624, + "num_tokens": 253642008.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "grad_norm": 1.6524507999420166, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.872414767742157, + "num_tokens": 253677452.0, + "step": 6653 + }, + { + "epoch": 0.8464571937412543, + "grad_norm": 1.599034070968628, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8362643122673035, + "num_tokens": 253717065.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "grad_norm": 
1.4705508947372437, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8667986989021301, + "num_tokens": 253757369.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "grad_norm": 1.5314007997512817, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8815125226974487, + "num_tokens": 253791658.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "grad_norm": 1.5803680419921875, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.86305832862854, + "num_tokens": 253825834.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "grad_norm": 1.4099642038345337, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.872746467590332, + "num_tokens": 253865156.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "grad_norm": 1.525543212890625, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8749910593032837, + "num_tokens": 253896638.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "grad_norm": 1.5540739297866821, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8599351644515991, + "num_tokens": 253932575.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "grad_norm": 1.5023959875106812, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8670927882194519, + "num_tokens": 253969307.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "grad_norm": 1.4272819757461548, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8674834966659546, + "num_tokens": 254009742.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "grad_norm": 1.443138837814331, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8734549283981323, + "num_tokens": 254048926.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "grad_norm": 1.4962655305862427, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8671308755874634, + "num_tokens": 
254084945.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "grad_norm": 1.4566420316696167, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8774917721748352, + "num_tokens": 254120496.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "grad_norm": 1.4784842729568481, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8624881505966187, + "num_tokens": 254157141.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "grad_norm": 1.479369878768921, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8585577011108398, + "num_tokens": 254197901.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "grad_norm": 1.5152692794799805, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8692289590835571, + "num_tokens": 254235900.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "grad_norm": 1.4511228799819946, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8675369620323181, + "num_tokens": 254275102.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "grad_norm": 1.5460888147354126, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.860699474811554, + "num_tokens": 254313551.0, + "step": 6670 + }, + { + "epoch": 0.8486197684772929, + "grad_norm": 1.4398564100265503, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8586050271987915, + "num_tokens": 254353281.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "grad_norm": 1.5313717126846313, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8716450929641724, + "num_tokens": 254390754.0, + "step": 6672 + }, + { + "epoch": 0.848874189034474, + "grad_norm": 1.3672019243240356, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8736960887908936, + "num_tokens": 254432708.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "grad_norm": 1.6071761846542358, + "learning_rate": 1e-06, + 
"loss": 0.4043, + "mean_token_accuracy": 0.8635038137435913, + "num_tokens": 254469048.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "grad_norm": 1.5061691999435425, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8588536977767944, + "num_tokens": 254510218.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "grad_norm": 1.597366213798523, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8746981620788574, + "num_tokens": 254543486.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "grad_norm": 1.3746333122253418, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.870496392250061, + "num_tokens": 254583816.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "grad_norm": 1.3828709125518799, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8584256172180176, + "num_tokens": 254625346.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "grad_norm": 1.403290867805481, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8649822473526001, + "num_tokens": 254666742.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "grad_norm": 1.6569569110870361, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8627394437789917, + "num_tokens": 254698823.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "grad_norm": 1.524865746498108, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8698001503944397, + "num_tokens": 254734855.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "grad_norm": 1.3886052370071411, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8673421144485474, + "num_tokens": 254774552.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "grad_norm": 1.5664913654327393, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8698095083236694, + "num_tokens": 254810052.0, + "step": 6683 + }, + { + "epoch": 
0.8502735020989696, + "grad_norm": 1.5518733263015747, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8816958665847778, + "num_tokens": 254843496.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "grad_norm": 1.3775569200515747, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8847323656082153, + "num_tokens": 254881280.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "grad_norm": 1.5430792570114136, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8731868863105774, + "num_tokens": 254919777.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "grad_norm": 1.563204288482666, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8648263216018677, + "num_tokens": 254953132.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "grad_norm": 1.4395982027053833, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8632103204727173, + "num_tokens": 254995602.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "grad_norm": 1.3829951286315918, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8872162103652954, + "num_tokens": 255035275.0, + "step": 6689 + }, + { + "epoch": 0.8510367637705126, + "grad_norm": 1.652371883392334, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8445384502410889, + "num_tokens": 255068094.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "grad_norm": 1.374086618423462, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8689452409744263, + "num_tokens": 255108495.0, + "step": 6691 + }, + { + "epoch": 0.8512911843276937, + "grad_norm": 1.6201913356781006, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8775109052658081, + "num_tokens": 255138652.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "grad_norm": 1.6520957946777344, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 
0.8568341732025146, + "num_tokens": 255173499.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "grad_norm": 1.5695880651474, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8431346416473389, + "num_tokens": 255211216.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "grad_norm": 1.3948372602462769, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8668982982635498, + "num_tokens": 255253690.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "grad_norm": 1.3811473846435547, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.870633602142334, + "num_tokens": 255294964.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "grad_norm": 1.6422194242477417, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8482491970062256, + "num_tokens": 255328641.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "grad_norm": 1.3987915515899658, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8641961812973022, + "num_tokens": 255369509.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "grad_norm": 1.4349987506866455, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8712481260299683, + "num_tokens": 255407891.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "grad_norm": 1.3665127754211426, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8819069862365723, + "num_tokens": 255445262.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "grad_norm": 1.491532802581787, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8796721696853638, + "num_tokens": 255482140.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "grad_norm": 1.3749359846115112, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8846344947814941, + "num_tokens": 255525308.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "grad_norm": 
1.5317760705947876, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8608626127243042, + "num_tokens": 255564126.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "grad_norm": 1.4073054790496826, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8759207725524902, + "num_tokens": 255602781.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "grad_norm": 1.531883955001831, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8745462894439697, + "num_tokens": 255638266.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "grad_norm": 1.530401349067688, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8408933281898499, + "num_tokens": 255677480.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "grad_norm": 1.4498368501663208, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8753311634063721, + "num_tokens": 255715124.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "grad_norm": 1.415723204612732, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8673089742660522, + "num_tokens": 255755060.0, + "step": 6708 + }, + { + "epoch": 0.8534537590637323, + "grad_norm": 1.516669750213623, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8780878186225891, + "num_tokens": 255790251.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "grad_norm": 1.37003755569458, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.854220449924469, + "num_tokens": 255835348.0, + "step": 6710 + }, + { + "epoch": 0.8537081796209134, + "grad_norm": 1.4034385681152344, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8679347038269043, + "num_tokens": 255879161.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "grad_norm": 1.6254338026046753, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8671758770942688, + "num_tokens": 
255912312.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "grad_norm": 1.6179242134094238, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8768562078475952, + "num_tokens": 255943179.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "grad_norm": 1.4281476736068726, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8699321150779724, + "num_tokens": 255980291.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "grad_norm": 1.5590219497680664, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8724542856216431, + "num_tokens": 256014186.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "grad_norm": 1.4673765897750854, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8641614317893982, + "num_tokens": 256050070.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "grad_norm": 1.5294227600097656, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8618011474609375, + "num_tokens": 256089052.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "grad_norm": 1.523010492324829, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8576273918151855, + "num_tokens": 256133365.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "grad_norm": 1.6279865503311157, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8638540506362915, + "num_tokens": 256168244.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "grad_norm": 1.4914617538452148, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8633791208267212, + "num_tokens": 256205699.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "grad_norm": 1.5210111141204834, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8715196251869202, + "num_tokens": 256242491.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "grad_norm": 1.4681344032287598, + "learning_rate": 1e-06, + 
"loss": 0.3205, + "mean_token_accuracy": 0.8873494267463684, + "num_tokens": 256278000.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "grad_norm": 1.5178278684616089, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8754198551177979, + "num_tokens": 256312855.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "grad_norm": 1.4510273933410645, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8737825155258179, + "num_tokens": 256350818.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "grad_norm": 1.4816803932189941, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8573721051216125, + "num_tokens": 256389057.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "grad_norm": 1.4032310247421265, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8844761848449707, + "num_tokens": 256430253.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "grad_norm": 1.5094757080078125, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8815315365791321, + "num_tokens": 256464815.0, + "step": 6727 + }, + { + "epoch": 0.855870754356952, + "grad_norm": 1.551361322402954, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8820793628692627, + "num_tokens": 256496172.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "grad_norm": 1.5011793375015259, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.871515154838562, + "num_tokens": 256530694.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, + "grad_norm": 1.5788884162902832, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8745596408843994, + "num_tokens": 256565582.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "grad_norm": 1.5802255868911743, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8580365777015686, + "num_tokens": 256599396.0, + "step": 6731 + }, + { + "epoch": 
0.856379595471314, + "grad_norm": 1.3993264436721802, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8802649974822998, + "num_tokens": 256638571.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "grad_norm": 1.463131070137024, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8737826347351074, + "num_tokens": 256679477.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "grad_norm": 1.4670742750167847, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8563306331634521, + "num_tokens": 256719916.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "grad_norm": 1.3833221197128296, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8780746459960938, + "num_tokens": 256760620.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "grad_norm": 1.462628722190857, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8708075284957886, + "num_tokens": 256797963.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "grad_norm": 1.364605188369751, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8584483861923218, + "num_tokens": 256840554.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.412846565246582, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8717981576919556, + "num_tokens": 256878536.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "grad_norm": 1.5375460386276245, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8673532605171204, + "num_tokens": 256914598.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "grad_norm": 1.3848233222961426, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.874318540096283, + "num_tokens": 256954947.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "grad_norm": 1.4932732582092285, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 
0.8628572821617126, + "num_tokens": 256994709.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "grad_norm": 1.5603729486465454, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8734308481216431, + "num_tokens": 257028081.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "grad_norm": 1.5290266275405884, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8491960763931274, + "num_tokens": 257069658.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "grad_norm": 1.6507699489593506, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.852576494216919, + "num_tokens": 257103565.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "grad_norm": 1.3622740507125854, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8676148653030396, + "num_tokens": 257147376.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "grad_norm": 1.4007946252822876, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8613555431365967, + "num_tokens": 257188583.0, + "step": 6746 + }, + { + "epoch": 0.8582877496501717, + "grad_norm": 1.5185699462890625, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.858295202255249, + "num_tokens": 257223878.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "grad_norm": 1.4158501625061035, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8680087327957153, + "num_tokens": 257263023.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "grad_norm": 1.5909589529037476, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8640773296356201, + "num_tokens": 257294224.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "grad_norm": 1.4337778091430664, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8748514652252197, + "num_tokens": 257331776.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "grad_norm": 
1.4415719509124756, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8598515391349792, + "num_tokens": 257372677.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "grad_norm": 1.5404125452041626, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8706138134002686, + "num_tokens": 257409760.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "grad_norm": 1.4992607831954956, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8657913208007812, + "num_tokens": 257442109.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "grad_norm": 1.444080114364624, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8712103366851807, + "num_tokens": 257477602.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "grad_norm": 1.4955437183380127, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.860863208770752, + "num_tokens": 257516658.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "grad_norm": 1.3923230171203613, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8752105236053467, + "num_tokens": 257555961.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "grad_norm": 1.436339259147644, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8615795373916626, + "num_tokens": 257597294.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "grad_norm": 1.4580857753753662, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8724944591522217, + "num_tokens": 257633848.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "grad_norm": 1.404499888420105, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8643224835395813, + "num_tokens": 257679440.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "grad_norm": 1.4377065896987915, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8655449151992798, + "num_tokens": 
257716585.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "grad_norm": 1.4439512491226196, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.859391450881958, + "num_tokens": 257754587.0, + "step": 6761 + }, + { + "epoch": 0.8601959038290294, + "grad_norm": 1.464438557624817, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.861880898475647, + "num_tokens": 257797947.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "grad_norm": 1.5483307838439941, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8733010292053223, + "num_tokens": 257830609.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "grad_norm": 1.4615504741668701, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8643776774406433, + "num_tokens": 257871911.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "grad_norm": 1.6201317310333252, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8416380286216736, + "num_tokens": 257907910.0, + "step": 6765 + }, + { + "epoch": 0.8607047449433914, + "grad_norm": 1.4831995964050293, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8522570133209229, + "num_tokens": 257947570.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "grad_norm": 1.6532719135284424, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8554791212081909, + "num_tokens": 257980018.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "grad_norm": 1.3921698331832886, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8819305896759033, + "num_tokens": 258020779.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "grad_norm": 1.4126105308532715, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8641029596328735, + "num_tokens": 258061040.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "grad_norm": 1.4185258150100708, + "learning_rate": 1e-06, + 
"loss": 0.3828, + "mean_token_accuracy": 0.8675300478935242, + "num_tokens": 258103773.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "grad_norm": 1.5069165229797363, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8702265024185181, + "num_tokens": 258142120.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "grad_norm": 1.4379829168319702, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8641298413276672, + "num_tokens": 258183953.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "grad_norm": 1.4744030237197876, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8719114065170288, + "num_tokens": 258221399.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "grad_norm": 1.5017125606536865, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8618277311325073, + "num_tokens": 258256899.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "grad_norm": 1.5143628120422363, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8765171766281128, + "num_tokens": 258289921.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "grad_norm": 1.57132887840271, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8840494155883789, + "num_tokens": 258325929.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "grad_norm": 1.5740635395050049, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8762721419334412, + "num_tokens": 258362185.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "grad_norm": 1.55872642993927, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8550159931182861, + "num_tokens": 258398925.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "grad_norm": 1.8171886205673218, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8608884811401367, + "num_tokens": 258428148.0, + "step": 6779 + }, + { + "epoch": 
0.8624856888436586, + "grad_norm": 1.5554306507110596, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.876567542552948, + "num_tokens": 258467424.0, + "step": 6780 + }, + { + "epoch": 0.862612899122249, + "grad_norm": 1.6281477212905884, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8558621406555176, + "num_tokens": 258504190.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "grad_norm": 1.5008405447006226, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8776575922966003, + "num_tokens": 258538631.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "grad_norm": 1.5962337255477905, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8626188039779663, + "num_tokens": 258573545.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "grad_norm": 1.4035553932189941, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8824682831764221, + "num_tokens": 258612314.0, + "step": 6784 + }, + { + "epoch": 0.8631217402366111, + "grad_norm": 1.5022164583206177, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8582383990287781, + "num_tokens": 258651179.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "grad_norm": 1.4073822498321533, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.887069046497345, + "num_tokens": 258689071.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "grad_norm": 1.4493873119354248, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8648840188980103, + "num_tokens": 258733136.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "grad_norm": 1.4127453565597534, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8607178926467896, + "num_tokens": 258771391.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "grad_norm": 1.4924516677856445, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 
0.8762037754058838, + "num_tokens": 258806626.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "grad_norm": 1.3687297105789185, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8804872035980225, + "num_tokens": 258847184.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "grad_norm": 1.5071320533752441, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8637353181838989, + "num_tokens": 258885613.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "grad_norm": 1.409730315208435, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.874408483505249, + "num_tokens": 258926716.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "grad_norm": 1.5210626125335693, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8521870374679565, + "num_tokens": 258964541.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "grad_norm": 1.5683927536010742, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8650456666946411, + "num_tokens": 259002717.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "grad_norm": 1.5924500226974487, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8750038743019104, + "num_tokens": 259039563.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "grad_norm": 1.447487235069275, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8686081171035767, + "num_tokens": 259081185.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "grad_norm": 1.5308856964111328, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8720158338546753, + "num_tokens": 259115652.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "grad_norm": 1.4351879358291626, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8743584752082825, + "num_tokens": 259154229.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "grad_norm": 
1.4690743684768677, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8805898427963257, + "num_tokens": 259191416.0, + "step": 6799 + }, + { + "epoch": 0.8650298944154687, + "grad_norm": 1.536057472229004, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8621198534965515, + "num_tokens": 259228455.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "grad_norm": 1.6332802772521973, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8471308946609497, + "num_tokens": 259269558.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "grad_norm": 1.4412437677383423, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8700332045555115, + "num_tokens": 259308678.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "grad_norm": 1.4898463487625122, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.874660849571228, + "num_tokens": 259343797.0, + "step": 6803 + }, + { + "epoch": 0.8655387355298308, + "grad_norm": 1.4029666185379028, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.879481315612793, + "num_tokens": 259382111.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "grad_norm": 1.6290596723556519, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8595093488693237, + "num_tokens": 259415738.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "grad_norm": 1.447972059249878, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8665710687637329, + "num_tokens": 259456906.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "grad_norm": 1.3671690225601196, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8906359672546387, + "num_tokens": 259494330.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "grad_norm": 1.610713243484497, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8783702254295349, + "num_tokens": 
259527335.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "grad_norm": 1.4880497455596924, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8766868114471436, + "num_tokens": 259560988.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "grad_norm": 1.4337269067764282, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8779813647270203, + "num_tokens": 259598020.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "grad_norm": 1.327397108078003, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.87473464012146, + "num_tokens": 259640840.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "grad_norm": 1.3337267637252808, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8725427985191345, + "num_tokens": 259682056.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "grad_norm": 1.4944835901260376, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8663282990455627, + "num_tokens": 259723311.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "grad_norm": 1.5484073162078857, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8676427006721497, + "num_tokens": 259760106.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "grad_norm": 1.4569146633148193, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8712486624717712, + "num_tokens": 259798550.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "grad_norm": 1.3891607522964478, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8740854263305664, + "num_tokens": 259841507.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "grad_norm": 1.4120243787765503, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8576964139938354, + "num_tokens": 259881868.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "grad_norm": 1.6102139949798584, + "learning_rate": 1e-06, + 
"loss": 0.3742, + "mean_token_accuracy": 0.8699795007705688, + "num_tokens": 259912446.0, + "step": 6818 + }, + { + "epoch": 0.8674468897086884, + "grad_norm": 1.4894009828567505, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8677340745925903, + "num_tokens": 259956510.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "grad_norm": 1.394898533821106, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8660405874252319, + "num_tokens": 259997912.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "grad_norm": 1.5575039386749268, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8541070222854614, + "num_tokens": 260030463.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "grad_norm": 1.4296574592590332, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8635467290878296, + "num_tokens": 260072144.0, + "step": 6822 + }, + { + "epoch": 0.8679557308230506, + "grad_norm": 1.4698365926742554, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8863134384155273, + "num_tokens": 260109434.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "grad_norm": 1.4537914991378784, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8730463981628418, + "num_tokens": 260147018.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "grad_norm": 1.4508609771728516, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8685203790664673, + "num_tokens": 260183593.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "grad_norm": 1.518886923789978, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8670076131820679, + "num_tokens": 260220567.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "grad_norm": 1.5462231636047363, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8699970245361328, + "num_tokens": 260257939.0, + "step": 6827 + }, + { + "epoch": 
0.8685917822160031, + "grad_norm": 1.4436895847320557, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8622540235519409, + "num_tokens": 260300234.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "grad_norm": 1.5934627056121826, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8722227811813354, + "num_tokens": 260339172.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "grad_norm": 1.435585856437683, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8747299909591675, + "num_tokens": 260377115.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "grad_norm": 1.390899896621704, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8661450147628784, + "num_tokens": 260418740.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "grad_norm": 1.5117204189300537, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8365104794502258, + "num_tokens": 260468537.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "grad_norm": 1.4944510459899902, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8834648132324219, + "num_tokens": 260505602.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "grad_norm": 1.5203969478607178, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8705558776855469, + "num_tokens": 260539064.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "grad_norm": 1.4725244045257568, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8802776336669922, + "num_tokens": 260572235.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "grad_norm": 1.4430005550384521, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8634618520736694, + "num_tokens": 260620092.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "grad_norm": 1.3920350074768066, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 
0.8904183506965637, + "num_tokens": 260657769.0, + "step": 6837 + }, + { + "epoch": 0.8698638850019081, + "grad_norm": 1.5332750082015991, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8780229091644287, + "num_tokens": 260690954.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "grad_norm": 1.5198713541030884, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8639016151428223, + "num_tokens": 260729376.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "grad_norm": 1.3831063508987427, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8645980358123779, + "num_tokens": 260772100.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "grad_norm": 1.4866070747375488, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8668831586837769, + "num_tokens": 260810080.0, + "step": 6841 + }, + { + "epoch": 0.8703727261162701, + "grad_norm": 1.4968631267547607, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8580155968666077, + "num_tokens": 260850133.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "grad_norm": 1.3902772665023804, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.867361307144165, + "num_tokens": 260891553.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "grad_norm": 1.4267935752868652, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8796859979629517, + "num_tokens": 260929881.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "grad_norm": 1.5153532028198242, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.855562686920166, + "num_tokens": 260968238.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "grad_norm": 1.4476032257080078, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8656404614448547, + "num_tokens": 261008107.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "grad_norm": 
1.315421462059021, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8794128894805908, + "num_tokens": 261052354.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "grad_norm": 1.5363625288009644, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8680917024612427, + "num_tokens": 261090519.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "grad_norm": 1.4053620100021362, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8659364581108093, + "num_tokens": 261132654.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "grad_norm": 1.333578109741211, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8742516040802002, + "num_tokens": 261178942.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "grad_norm": 1.4762314558029175, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8614304065704346, + "num_tokens": 261221254.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "grad_norm": 1.7423086166381836, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8485997915267944, + "num_tokens": 261258488.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "grad_norm": 1.468327522277832, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8484032154083252, + "num_tokens": 261300488.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "grad_norm": 1.804286003112793, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8811092376708984, + "num_tokens": 261337711.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "grad_norm": 1.4699076414108276, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.876481294631958, + "num_tokens": 261375297.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "grad_norm": 1.4220294952392578, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8746821880340576, + "num_tokens": 
261416395.0, + "step": 6856 + }, + { + "epoch": 0.8722808802951278, + "grad_norm": 1.6918236017227173, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8669195175170898, + "num_tokens": 261452116.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "grad_norm": 1.5043519735336304, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8727734684944153, + "num_tokens": 261486606.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "grad_norm": 1.615834355354309, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8661015033721924, + "num_tokens": 261523188.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "grad_norm": 1.5323551893234253, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8430943489074707, + "num_tokens": 261563866.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "grad_norm": 1.4964308738708496, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8618333339691162, + "num_tokens": 261603162.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "grad_norm": 1.5812028646469116, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8471588492393494, + "num_tokens": 261643398.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "grad_norm": 1.513586163520813, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8690134286880493, + "num_tokens": 261677973.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "grad_norm": 1.3094154596328735, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8759924173355103, + "num_tokens": 261723364.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "grad_norm": 1.5131957530975342, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8739190101623535, + "num_tokens": 261759234.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "grad_norm": 1.4592516422271729, + "learning_rate": 1e-06, + 
"loss": 0.3621, + "mean_token_accuracy": 0.8738656640052795, + "num_tokens": 261796485.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "grad_norm": 1.5113966464996338, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.858896017074585, + "num_tokens": 261835031.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "grad_norm": 1.4765163660049438, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8697582483291626, + "num_tokens": 261876892.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "grad_norm": 1.494350552558899, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8714515566825867, + "num_tokens": 261916006.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "grad_norm": 1.431931734085083, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8766290545463562, + "num_tokens": 261952324.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "grad_norm": 1.4982858896255493, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8724132776260376, + "num_tokens": 261995706.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "grad_norm": 1.6327333450317383, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8583256006240845, + "num_tokens": 262031375.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "grad_norm": 1.428942322731018, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8707451820373535, + "num_tokens": 262071932.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "grad_norm": 1.402282476425171, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8804400563240051, + "num_tokens": 262107596.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "grad_norm": 1.4356368780136108, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8705525398254395, + "num_tokens": 262149157.0, + "step": 6875 + }, + { + "epoch": 
0.8746978755883476, + "grad_norm": 1.4757484197616577, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8662810325622559, + "num_tokens": 262186520.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "grad_norm": 1.465738296508789, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8523951768875122, + "num_tokens": 262227366.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "grad_norm": 1.5582224130630493, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8616528511047363, + "num_tokens": 262264314.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "grad_norm": 1.5152451992034912, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8712812662124634, + "num_tokens": 262301444.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "grad_norm": 1.473300814628601, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8690499067306519, + "num_tokens": 262341943.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "grad_norm": 1.5898725986480713, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8732463121414185, + "num_tokens": 262376007.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "grad_norm": 1.4123213291168213, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8654007315635681, + "num_tokens": 262417103.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "grad_norm": 1.4246224164962769, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8617285490036011, + "num_tokens": 262458175.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "grad_norm": 1.4219316244125366, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8518688678741455, + "num_tokens": 262500782.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "grad_norm": 1.4538170099258423, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 
0.8672891855239868, + "num_tokens": 262541939.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "grad_norm": 1.5722819566726685, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8685128688812256, + "num_tokens": 262577593.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "grad_norm": 1.467171311378479, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8627023100852966, + "num_tokens": 262616041.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "grad_norm": 1.383882999420166, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8729841709136963, + "num_tokens": 262655876.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "grad_norm": 1.4097537994384766, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8637780547142029, + "num_tokens": 262697078.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "grad_norm": 1.6252052783966064, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8701298236846924, + "num_tokens": 262729584.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "grad_norm": 1.4557379484176636, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8541518449783325, + "num_tokens": 262768800.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "grad_norm": 1.4825350046157837, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8601682186126709, + "num_tokens": 262811246.0, + "step": 6892 + }, + { + "epoch": 0.8768604503243862, + "grad_norm": 1.6372045278549194, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8480474352836609, + "num_tokens": 262844771.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "grad_norm": 1.4398611783981323, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8570272922515869, + "num_tokens": 262884116.0, + "step": 6894 + }, + { + "epoch": 0.8771148708815673, + "grad_norm": 
1.5513291358947754, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.86480712890625, + "num_tokens": 262919905.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "grad_norm": 1.4169974327087402, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8676512241363525, + "num_tokens": 262960578.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "grad_norm": 1.5498591661453247, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8637655973434448, + "num_tokens": 262996601.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "grad_norm": 1.4803231954574585, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8675066232681274, + "num_tokens": 263032810.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "grad_norm": 1.527008295059204, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8412723541259766, + "num_tokens": 263073050.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "grad_norm": 1.5209159851074219, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8674283623695374, + "num_tokens": 263111597.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "grad_norm": 1.4554060697555542, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8785372972488403, + "num_tokens": 263150477.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "grad_norm": 1.4282169342041016, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8851959109306335, + "num_tokens": 263190529.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "grad_norm": 1.4879310131072998, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8788714408874512, + "num_tokens": 263224225.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "grad_norm": 1.6878455877304077, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.862953245639801, + "num_tokens": 
263254471.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "grad_norm": 1.4034390449523926, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8686052560806274, + "num_tokens": 263293675.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "grad_norm": 1.382066011428833, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8742532730102539, + "num_tokens": 263335069.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "grad_norm": 1.3870973587036133, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8895021080970764, + "num_tokens": 263371141.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "grad_norm": 1.4605423212051392, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8658884763717651, + "num_tokens": 263408085.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "grad_norm": 1.3257354497909546, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8781543970108032, + "num_tokens": 263452306.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "grad_norm": 1.5495270490646362, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8719092607498169, + "num_tokens": 263487567.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "grad_norm": 1.4974665641784668, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8678285479545593, + "num_tokens": 263524851.0, + "step": 6911 + }, + { + "epoch": 0.8792774456176059, + "grad_norm": 1.4586830139160156, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.871353030204773, + "num_tokens": 263564008.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "grad_norm": 1.337896704673767, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8703012466430664, + "num_tokens": 263610572.0, + "step": 6913 + }, + { + "epoch": 0.879531866174787, + "grad_norm": 1.4077842235565186, + "learning_rate": 1e-06, + 
"loss": 0.3847, + "mean_token_accuracy": 0.8632375597953796, + "num_tokens": 263653280.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "grad_norm": 1.55910325050354, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8766504526138306, + "num_tokens": 263688077.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "grad_norm": 1.593783974647522, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8696702718734741, + "num_tokens": 263719254.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "grad_norm": 1.5793133974075317, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8545147180557251, + "num_tokens": 263760354.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "grad_norm": 1.4018089771270752, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8800550699234009, + "num_tokens": 263797290.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "grad_norm": 1.5340938568115234, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8609318137168884, + "num_tokens": 263837802.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "grad_norm": 1.5006239414215088, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8676869869232178, + "num_tokens": 263874611.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "grad_norm": 1.4105676412582397, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8736926317214966, + "num_tokens": 263913872.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "grad_norm": 1.4740427732467651, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8751528263092041, + "num_tokens": 263947002.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "grad_norm": 1.4218755960464478, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8791432976722717, + "num_tokens": 263988221.0, + "step": 6923 + }, + { + "epoch": 
0.880803968960692, + "grad_norm": 1.4569944143295288, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8680839538574219, + "num_tokens": 264026166.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "grad_norm": 1.5701913833618164, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8501952886581421, + "num_tokens": 264064279.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "grad_norm": 1.5992920398712158, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.858597993850708, + "num_tokens": 264098638.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "grad_norm": 1.3848158121109009, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8753501176834106, + "num_tokens": 264136601.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "grad_norm": 1.3971055746078491, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8678643703460693, + "num_tokens": 264178543.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "grad_norm": 1.580698013305664, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.846672773361206, + "num_tokens": 264214572.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "grad_norm": 1.510308027267456, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8611491918563843, + "num_tokens": 264255191.0, + "step": 6930 + }, + { + "epoch": 0.8816944409108256, + "grad_norm": 1.4664456844329834, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8613789677619934, + "num_tokens": 264292985.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "grad_norm": 1.352426528930664, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8743677735328674, + "num_tokens": 264338050.0, + "step": 6932 + }, + { + "epoch": 0.8819488614680067, + "grad_norm": 1.3805599212646484, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 
0.8640174269676208, + "num_tokens": 264378871.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "grad_norm": 1.6481761932373047, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8754891753196716, + "num_tokens": 264408737.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "grad_norm": 1.421130657196045, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8702566623687744, + "num_tokens": 264447414.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "grad_norm": 1.5160984992980957, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8538784980773926, + "num_tokens": 264488056.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "grad_norm": 1.2944079637527466, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8702142238616943, + "num_tokens": 264536591.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "grad_norm": 1.4707902669906616, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8693117499351501, + "num_tokens": 264578427.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "grad_norm": 1.6436967849731445, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8583046197891235, + "num_tokens": 264614381.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "grad_norm": 1.393092155456543, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8774491548538208, + "num_tokens": 264655174.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "grad_norm": 1.5256065130233765, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8592829704284668, + "num_tokens": 264694697.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "grad_norm": 1.5019936561584473, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.876977801322937, + "num_tokens": 264729681.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "grad_norm": 
1.5605063438415527, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8600974082946777, + "num_tokens": 264766969.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "grad_norm": 1.5480984449386597, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.865957498550415, + "num_tokens": 264802134.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "grad_norm": 1.3558235168457031, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8616113662719727, + "num_tokens": 264846837.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "grad_norm": 1.5063283443450928, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8749803304672241, + "num_tokens": 264881426.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "grad_norm": 1.4208341836929321, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8658121228218079, + "num_tokens": 264924377.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "grad_norm": 1.4795619249343872, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8674476742744446, + "num_tokens": 264962403.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "grad_norm": 1.4739563465118408, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8568230867385864, + "num_tokens": 265001140.0, + "step": 6949 + }, + { + "epoch": 0.8841114362040453, + "grad_norm": 1.4368369579315186, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8722355365753174, + "num_tokens": 265038566.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "grad_norm": 1.440657138824463, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8655096292495728, + "num_tokens": 265077915.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "grad_norm": 1.3854339122772217, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8796955943107605, + "num_tokens": 
265118653.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "grad_norm": 1.4713809490203857, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8719000816345215, + "num_tokens": 265157781.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "grad_norm": 1.556650996208191, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8687149286270142, + "num_tokens": 265195828.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "grad_norm": 1.578700304031372, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.869169294834137, + "num_tokens": 265232064.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "grad_norm": 1.6262880563735962, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8599919080734253, + "num_tokens": 265267027.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "grad_norm": 1.5348541736602783, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8558869361877441, + "num_tokens": 265305046.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "grad_norm": 1.4893172979354858, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8647083044052124, + "num_tokens": 265343651.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "grad_norm": 1.4988700151443481, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.862176239490509, + "num_tokens": 265379985.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "grad_norm": 1.5812958478927612, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8651270866394043, + "num_tokens": 265414307.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "grad_norm": 1.433814525604248, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8671631217002869, + "num_tokens": 265455515.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "grad_norm": 1.579444169998169, + "learning_rate": 1e-06, + 
"loss": 0.3236, + "mean_token_accuracy": 0.8859716057777405, + "num_tokens": 265488094.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "grad_norm": 1.5253320932388306, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8595930933952332, + "num_tokens": 265526810.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "grad_norm": 1.3883845806121826, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8777279853820801, + "num_tokens": 265567006.0, + "step": 6964 + }, + { + "epoch": 0.8860195903829029, + "grad_norm": 1.5821462869644165, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.863512396812439, + "num_tokens": 265603025.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "grad_norm": 1.5158878564834595, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8521263003349304, + "num_tokens": 265640007.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "grad_norm": 1.4075108766555786, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8605443835258484, + "num_tokens": 265682929.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "grad_norm": 1.4045922756195068, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8785020112991333, + "num_tokens": 265724469.0, + "step": 6968 + }, + { + "epoch": 0.886528431497265, + "grad_norm": 1.6818293333053589, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8722240328788757, + "num_tokens": 265755678.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "grad_norm": 1.722550630569458, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8339591026306152, + "num_tokens": 265796566.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "grad_norm": 1.417747974395752, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8841400146484375, + "num_tokens": 265835066.0, + "step": 6971 + }, + { + "epoch": 
0.8869100623330365, + "grad_norm": 1.335425615310669, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8732318878173828, + "num_tokens": 265877759.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "grad_norm": 1.5064550638198853, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8758321404457092, + "num_tokens": 265911331.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "grad_norm": 1.380279779434204, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8669370412826538, + "num_tokens": 265952968.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "grad_norm": 1.5267895460128784, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8681902289390564, + "num_tokens": 265988456.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "grad_norm": 1.6793639659881592, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8704445362091064, + "num_tokens": 266024757.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "grad_norm": 1.4731452465057373, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8625551462173462, + "num_tokens": 266063942.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "grad_norm": 1.4513922929763794, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8793993592262268, + "num_tokens": 266096605.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "grad_norm": 1.4129031896591187, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8669881820678711, + "num_tokens": 266136315.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "grad_norm": 1.5430375337600708, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8692628145217896, + "num_tokens": 266168642.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "grad_norm": 1.4422211647033691, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 
0.8612229228019714, + "num_tokens": 266208614.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "grad_norm": 1.6692276000976562, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8665632009506226, + "num_tokens": 266238944.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "grad_norm": 1.337033987045288, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8833106756210327, + "num_tokens": 266279034.0, + "step": 6983 + }, + { + "epoch": 0.8884365856761226, + "grad_norm": 1.5295268297195435, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8587121963500977, + "num_tokens": 266316483.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "grad_norm": 1.3506460189819336, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8697777390480042, + "num_tokens": 266359588.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "grad_norm": 1.4561138153076172, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8507406115531921, + "num_tokens": 266398569.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "grad_norm": 1.495607852935791, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8585502505302429, + "num_tokens": 266437136.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "grad_norm": 1.439895749092102, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8756061792373657, + "num_tokens": 266478071.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "grad_norm": 1.4691317081451416, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8644095063209534, + "num_tokens": 266518212.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "grad_norm": 1.4826154708862305, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8654571771621704, + "num_tokens": 266556674.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "grad_norm": 
1.5509791374206543, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.876888632774353, + "num_tokens": 266588801.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "grad_norm": 1.4526548385620117, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8595167398452759, + "num_tokens": 266628051.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "grad_norm": 1.5394561290740967, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8568359613418579, + "num_tokens": 266666918.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "grad_norm": 1.5427589416503906, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.865424633026123, + "num_tokens": 266703411.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "grad_norm": 1.480062484741211, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8538614511489868, + "num_tokens": 266743394.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "grad_norm": 1.3662465810775757, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8909982442855835, + "num_tokens": 266784645.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "grad_norm": 1.5174182653427124, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8629252910614014, + "num_tokens": 266820387.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "grad_norm": 1.373570442199707, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.863953709602356, + "num_tokens": 266863217.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "grad_norm": 1.3807861804962158, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8822245001792908, + "num_tokens": 266904414.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "grad_norm": 1.4224056005477905, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8682304620742798, + "num_tokens": 
266946709.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "grad_norm": 1.6059257984161377, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8555420637130737, + "num_tokens": 266981904.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "grad_norm": 1.5734597444534302, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8388643264770508, + "num_tokens": 267020351.0, + "step": 7002 + }, + { + "epoch": 0.8908535809693423, + "grad_norm": 1.4570269584655762, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8675285577774048, + "num_tokens": 267059720.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "grad_norm": 1.5258572101593018, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8533626198768616, + "num_tokens": 267101440.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "grad_norm": 1.37392258644104, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8827527761459351, + "num_tokens": 267141146.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "grad_norm": 1.6027247905731201, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8726792931556702, + "num_tokens": 267174321.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "grad_norm": 1.4677881002426147, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.862062394618988, + "num_tokens": 267214405.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "grad_norm": 1.2890933752059937, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8856883645057678, + "num_tokens": 267253174.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "grad_norm": 1.4075769186019897, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8563266396522522, + "num_tokens": 267299098.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "grad_norm": 1.5110807418823242, + "learning_rate": 1e-06, + 
"loss": 0.3882, + "mean_token_accuracy": 0.8670808672904968, + "num_tokens": 267333612.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "grad_norm": 1.426131248474121, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8755909204483032, + "num_tokens": 267374036.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "grad_norm": 1.5841960906982422, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8629593849182129, + "num_tokens": 267407453.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "grad_norm": 1.5169543027877808, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8697167038917542, + "num_tokens": 267440996.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "grad_norm": 1.5381942987442017, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.862396240234375, + "num_tokens": 267480116.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "grad_norm": 1.4230870008468628, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8604297637939453, + "num_tokens": 267523638.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "grad_norm": 1.3774880170822144, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8709691166877747, + "num_tokens": 267564069.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "grad_norm": 1.4391216039657593, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8805316090583801, + "num_tokens": 267601415.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "grad_norm": 1.429512858390808, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8679682612419128, + "num_tokens": 267641333.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "grad_norm": 1.4575554132461548, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8674619793891907, + "num_tokens": 267680730.0, + "step": 7019 + }, + { + "epoch": 
0.893016155705381, + "grad_norm": 1.5998631715774536, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8536524772644043, + "num_tokens": 267719179.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "grad_norm": 1.4055283069610596, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8690852522850037, + "num_tokens": 267760394.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + "grad_norm": 1.4102158546447754, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8683133125305176, + "num_tokens": 267799399.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "grad_norm": 1.6480506658554077, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8565665483474731, + "num_tokens": 267830936.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "grad_norm": 1.4374510049819946, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8583037853240967, + "num_tokens": 267874764.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "grad_norm": 1.3993067741394043, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8660381436347961, + "num_tokens": 267915137.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "grad_norm": 1.440146565437317, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8575328588485718, + "num_tokens": 267958328.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "grad_norm": 1.5431978702545166, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8705311417579651, + "num_tokens": 267992517.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "grad_norm": 1.570547103881836, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8802647590637207, + "num_tokens": 268023133.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "grad_norm": 1.532631516456604, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 
0.8684695363044739, + "num_tokens": 268062935.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "grad_norm": 1.436261773109436, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8844879269599915, + "num_tokens": 268098815.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "grad_norm": 1.4202362298965454, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8499301671981812, + "num_tokens": 268144256.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "grad_norm": 1.4116861820220947, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8691733479499817, + "num_tokens": 268184437.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "grad_norm": 1.4161936044692993, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8610973358154297, + "num_tokens": 268227158.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "grad_norm": 1.3991296291351318, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8720619082450867, + "num_tokens": 268266360.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "grad_norm": 1.6415119171142578, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8678642511367798, + "num_tokens": 268300760.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "grad_norm": 1.5791049003601074, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8640714287757874, + "num_tokens": 268338766.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "grad_norm": 1.6602556705474854, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8579024076461792, + "num_tokens": 268372013.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "grad_norm": 1.4728015661239624, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.874305248260498, + "num_tokens": 268410851.0, + "step": 7038 + }, + { + "epoch": 0.8954331509986007, + "grad_norm": 
1.470115065574646, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8886198997497559, + "num_tokens": 268444816.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "grad_norm": 1.5579477548599243, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8661137819290161, + "num_tokens": 268487640.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "grad_norm": 1.6570672988891602, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8770405054092407, + "num_tokens": 268520184.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "grad_norm": 1.446921467781067, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8739732503890991, + "num_tokens": 268557166.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "grad_norm": 1.5772377252578735, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8629303574562073, + "num_tokens": 268593799.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "grad_norm": 1.5057258605957031, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.865468442440033, + "num_tokens": 268632905.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "grad_norm": 1.5479044914245605, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8733391761779785, + "num_tokens": 268667938.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "grad_norm": 1.539312481880188, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.884394645690918, + "num_tokens": 268699816.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "grad_norm": 1.4838662147521973, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8620805740356445, + "num_tokens": 268738938.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "grad_norm": 1.5502997636795044, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8464909791946411, + "num_tokens": 
268779302.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "grad_norm": 1.5706961154937744, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8837331533432007, + "num_tokens": 268812122.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "grad_norm": 1.6398576498031616, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8557471036911011, + "num_tokens": 268845696.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "grad_norm": 1.4713435173034668, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8671016097068787, + "num_tokens": 268883102.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "grad_norm": 1.454861044883728, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8746587038040161, + "num_tokens": 268924043.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "grad_norm": 1.4066648483276367, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8728028535842896, + "num_tokens": 268965955.0, + "step": 7053 + }, + { + "epoch": 0.8973413051774584, + "grad_norm": 1.50262451171875, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8641377091407776, + "num_tokens": 269003374.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "grad_norm": 1.4015792608261108, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8651607036590576, + "num_tokens": 269048203.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "grad_norm": 1.3465849161148071, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8903656005859375, + "num_tokens": 269085618.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "grad_norm": 1.4526668787002563, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8496912717819214, + "num_tokens": 269125577.0, + "step": 7057 + }, + { + "epoch": 0.8978501462918204, + "grad_norm": 1.4060721397399902, + "learning_rate": 1e-06, + 
"loss": 0.3239, + "mean_token_accuracy": 0.8891209363937378, + "num_tokens": 269164009.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "grad_norm": 1.5271177291870117, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.861153244972229, + "num_tokens": 269200727.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "grad_norm": 1.5016065835952759, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.861145555973053, + "num_tokens": 269238940.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "grad_norm": 1.4170458316802979, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8643590211868286, + "num_tokens": 269275624.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "grad_norm": 1.595091700553894, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8710712194442749, + "num_tokens": 269310233.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "grad_norm": 1.496074914932251, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8635307550430298, + "num_tokens": 269348118.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "grad_norm": 1.4668824672698975, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8608596920967102, + "num_tokens": 269386442.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "grad_norm": 1.4313652515411377, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8598430156707764, + "num_tokens": 269429148.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "grad_norm": 1.5358812808990479, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.850983202457428, + "num_tokens": 269471187.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "grad_norm": 1.3826113939285278, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8555762767791748, + "num_tokens": 269512461.0, + "step": 7067 + }, + { + "epoch": 
0.8991222490777255, + "grad_norm": 1.455209732055664, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8763654828071594, + "num_tokens": 269548569.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "grad_norm": 1.4156173467636108, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8680046796798706, + "num_tokens": 269588950.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "grad_norm": 1.4342951774597168, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8670885562896729, + "num_tokens": 269627313.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "grad_norm": 1.403110384941101, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.874714732170105, + "num_tokens": 269672773.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "grad_norm": 1.467181921005249, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8851076364517212, + "num_tokens": 269708954.0, + "step": 7072 + }, + { + "epoch": 0.8997583004706781, + "grad_norm": 1.5710029602050781, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8585381507873535, + "num_tokens": 269743050.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "grad_norm": 1.4274543523788452, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8783732652664185, + "num_tokens": 269777327.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "grad_norm": 1.5350315570831299, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8526847958564758, + "num_tokens": 269816598.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "grad_norm": 1.2783801555633545, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8826464414596558, + "num_tokens": 269858332.0, + "step": 7076 + }, + { + "epoch": 0.9002671415850401, + "grad_norm": 1.4992990493774414, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 
0.8766787052154541, + "num_tokens": 269890797.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "grad_norm": 1.5136675834655762, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8765882849693298, + "num_tokens": 269923366.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "grad_norm": 1.4539519548416138, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8725990653038025, + "num_tokens": 269960179.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "grad_norm": 1.5212199687957764, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8598131537437439, + "num_tokens": 269997369.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "grad_norm": 1.4224339723587036, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8663833141326904, + "num_tokens": 270037072.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "grad_norm": 1.497401475906372, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8640711307525635, + "num_tokens": 270075107.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "grad_norm": 1.5844753980636597, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8404068946838379, + "num_tokens": 270115806.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "grad_norm": 1.595216155052185, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8529878258705139, + "num_tokens": 270151413.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "grad_norm": 1.4937247037887573, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8446732759475708, + "num_tokens": 270191407.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "grad_norm": 1.4219777584075928, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8864149451255798, + "num_tokens": 270229068.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "grad_norm": 
1.4042352437973022, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8771079778671265, + "num_tokens": 270268101.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "grad_norm": 1.560386300086975, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8626933693885803, + "num_tokens": 270303457.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "grad_norm": 1.6200902462005615, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8798028230667114, + "num_tokens": 270337723.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "grad_norm": 1.3699854612350464, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.875108003616333, + "num_tokens": 270380417.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "grad_norm": 1.5398600101470947, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8688708543777466, + "num_tokens": 270421955.0, + "step": 7091 + }, + { + "epoch": 0.9021752957638978, + "grad_norm": 1.4231278896331787, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8674788475036621, + "num_tokens": 270460645.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "grad_norm": 1.4723412990570068, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8818861246109009, + "num_tokens": 270494087.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "grad_norm": 1.3948169946670532, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8791420459747314, + "num_tokens": 270533820.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "grad_norm": 1.3003721237182617, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8887829184532166, + "num_tokens": 270578248.0, + "step": 7095 + }, + { + "epoch": 0.9026841368782598, + "grad_norm": 1.4002580642700195, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8573429584503174, + "num_tokens": 
270619448.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "grad_norm": 1.3759483098983765, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8779969811439514, + "num_tokens": 270659627.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "grad_norm": 1.410328984260559, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8753327131271362, + "num_tokens": 270698111.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "grad_norm": 1.39932119846344, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8738709092140198, + "num_tokens": 270735999.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "grad_norm": 1.5904563665390015, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8514081835746765, + "num_tokens": 270770045.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "grad_norm": 1.3796132802963257, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8730001449584961, + "num_tokens": 270813658.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "grad_norm": 1.5146019458770752, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8632288575172424, + "num_tokens": 270851363.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "grad_norm": 1.5741175413131714, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.866557240486145, + "num_tokens": 270886907.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "grad_norm": 1.6370688676834106, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8616440296173096, + "num_tokens": 270922831.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "grad_norm": 1.4938437938690186, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8798247575759888, + "num_tokens": 270957982.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "grad_norm": 1.4202675819396973, + "learning_rate": 1e-06, + 
"loss": 0.4134, + "mean_token_accuracy": 0.8601585626602173, + "num_tokens": 271003880.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "grad_norm": 1.670384407043457, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.862960934638977, + "num_tokens": 271035133.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "grad_norm": 1.3970531225204468, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8582069873809814, + "num_tokens": 271080158.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "grad_norm": 1.335183024406433, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8778072595596313, + "num_tokens": 271125370.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "grad_norm": 1.4632686376571655, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8713924884796143, + "num_tokens": 271161542.0, + "step": 7110 + }, + { + "epoch": 0.9045922910571175, + "grad_norm": 1.5081748962402344, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8662546277046204, + "num_tokens": 271198255.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "grad_norm": 1.5601106882095337, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.862539529800415, + "num_tokens": 271234692.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "grad_norm": 1.4087375402450562, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8823351860046387, + "num_tokens": 271273895.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "grad_norm": 1.4779778718948364, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8731529712677002, + "num_tokens": 271308362.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "grad_norm": 1.388305902481079, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8630985021591187, + "num_tokens": 271352163.0, + "step": 7115 + }, + { + "epoch": 
0.90522834245007, + "grad_norm": 1.518970012664795, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8688457012176514, + "num_tokens": 271390980.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "grad_norm": 1.390293836593628, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8799533843994141, + "num_tokens": 271430605.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "grad_norm": 1.5146689414978027, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8693045377731323, + "num_tokens": 271466811.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "grad_norm": 1.4437294006347656, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8831940293312073, + "num_tokens": 271505931.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "grad_norm": 1.4054155349731445, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8771006464958191, + "num_tokens": 271544154.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "grad_norm": 1.4070205688476562, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8643986582756042, + "num_tokens": 271585716.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "grad_norm": 1.6288318634033203, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8489589691162109, + "num_tokens": 271623082.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "grad_norm": 1.5572987794876099, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.853118896484375, + "num_tokens": 271657612.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "grad_norm": 1.4883111715316772, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8624682426452637, + "num_tokens": 271698109.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "grad_norm": 1.4649304151535034, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 
0.8672940135002136, + "num_tokens": 271738506.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "grad_norm": 1.3445088863372803, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8711125254631042, + "num_tokens": 271780911.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "grad_norm": 1.553062915802002, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8621431589126587, + "num_tokens": 271817362.0, + "step": 7127 + }, + { + "epoch": 0.9067548657931561, + "grad_norm": 1.6258440017700195, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.860572338104248, + "num_tokens": 271852694.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "grad_norm": 1.5275789499282837, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8711085319519043, + "num_tokens": 271888254.0, + "step": 7129 + }, + { + "epoch": 0.9070092863503371, + "grad_norm": 1.588618278503418, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8739830255508423, + "num_tokens": 271919000.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "grad_norm": 1.383102297782898, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8614552021026611, + "num_tokens": 271961650.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "grad_norm": 1.5400888919830322, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8638601303100586, + "num_tokens": 271999168.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "grad_norm": 1.4425041675567627, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8671822547912598, + "num_tokens": 272038584.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "grad_norm": 1.698668122291565, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.845654308795929, + "num_tokens": 272071584.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "grad_norm": 
1.6087661981582642, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8613570928573608, + "num_tokens": 272107359.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "grad_norm": 1.5483571290969849, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8759897947311401, + "num_tokens": 272142308.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "grad_norm": 1.516047716140747, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8409727215766907, + "num_tokens": 272180685.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "grad_norm": 1.375727891921997, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8720410466194153, + "num_tokens": 272221323.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "grad_norm": 1.537957787513733, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8765019178390503, + "num_tokens": 272260654.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "grad_norm": 1.3768106698989868, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8944191932678223, + "num_tokens": 272297724.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "grad_norm": 1.5369473695755005, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.852232813835144, + "num_tokens": 272336898.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "grad_norm": 1.4497461318969727, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8681865930557251, + "num_tokens": 272375484.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "grad_norm": 1.4100264310836792, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.869837760925293, + "num_tokens": 272416943.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "grad_norm": 1.6085635423660278, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8549820184707642, + "num_tokens": 
272451553.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "grad_norm": 1.5162924528121948, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.86817866563797, + "num_tokens": 272487318.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "grad_norm": 1.4756063222885132, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8756929636001587, + "num_tokens": 272524139.0, + "step": 7146 + }, + { + "epoch": 0.9091718610863758, + "grad_norm": 1.4483747482299805, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.875785231590271, + "num_tokens": 272557712.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "grad_norm": 1.4830697774887085, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.855139434337616, + "num_tokens": 272599051.0, + "step": 7148 + }, + { + "epoch": 0.9094262816435568, + "grad_norm": 1.4964808225631714, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8575390577316284, + "num_tokens": 272641987.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "grad_norm": 1.4410854578018188, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8706575036048889, + "num_tokens": 272683181.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "grad_norm": 1.399213433265686, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8757089376449585, + "num_tokens": 272723340.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "grad_norm": 1.5476112365722656, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8525172472000122, + "num_tokens": 272766621.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "grad_norm": 1.6865026950836182, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8729973435401917, + "num_tokens": 272801174.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "grad_norm": 1.6166119575500488, + "learning_rate": 1e-06, + 
"loss": 0.3662, + "mean_token_accuracy": 0.8712196946144104, + "num_tokens": 272837651.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "grad_norm": 1.369385838508606, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8637981414794922, + "num_tokens": 272878643.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "grad_norm": 1.5685173273086548, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8658220767974854, + "num_tokens": 272916259.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "grad_norm": 1.32156240940094, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.863823413848877, + "num_tokens": 272963722.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "grad_norm": 1.4315567016601562, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8768056631088257, + "num_tokens": 273002682.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "grad_norm": 1.5063233375549316, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8660315275192261, + "num_tokens": 273040028.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "grad_norm": 1.5674734115600586, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8729235529899597, + "num_tokens": 273075503.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "grad_norm": 1.6009160280227661, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8692512512207031, + "num_tokens": 273107394.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "grad_norm": 1.5878140926361084, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8708562254905701, + "num_tokens": 273137717.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "grad_norm": 1.5430177450180054, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.87543785572052, + "num_tokens": 273168852.0, + "step": 7163 + }, + { + "epoch": 
0.9113344358224145, + "grad_norm": 1.4085201025009155, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8723985552787781, + "num_tokens": 273206087.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "grad_norm": 1.5149213075637817, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8625050783157349, + "num_tokens": 273245200.0, + "step": 7165 + }, + { + "epoch": 0.9115888563795955, + "grad_norm": 1.4307812452316284, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8793116807937622, + "num_tokens": 273283828.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "grad_norm": 1.6918413639068604, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8719750046730042, + "num_tokens": 273314620.0, + "step": 7167 + }, + { + "epoch": 0.9118432769367765, + "grad_norm": 1.4533910751342773, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8564285635948181, + "num_tokens": 273357445.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "grad_norm": 1.377198576927185, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8736117482185364, + "num_tokens": 273397049.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "grad_norm": 1.459420084953308, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8550819754600525, + "num_tokens": 273440342.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "grad_norm": 1.3692021369934082, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8736396431922913, + "num_tokens": 273482739.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "grad_norm": 1.3730542659759521, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8606078624725342, + "num_tokens": 273525637.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "grad_norm": 1.4887782335281372, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 
0.8620896339416504, + "num_tokens": 273564169.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "grad_norm": 1.402262568473816, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8585201501846313, + "num_tokens": 273606600.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "grad_norm": 1.4146767854690552, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8731046319007874, + "num_tokens": 273645787.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "grad_norm": 1.3080523014068604, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8829114437103271, + "num_tokens": 273687346.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "grad_norm": 1.4154425859451294, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.877474308013916, + "num_tokens": 273723908.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "grad_norm": 1.475407600402832, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8602437973022461, + "num_tokens": 273763951.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "grad_norm": 1.4725910425186157, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8718565106391907, + "num_tokens": 273800573.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "grad_norm": 1.4435362815856934, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8684257864952087, + "num_tokens": 273840484.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "grad_norm": 1.4335081577301025, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8636733293533325, + "num_tokens": 273882382.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "grad_norm": 1.5606005191802979, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8597061038017273, + "num_tokens": 273921594.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "grad_norm": 
1.446800708770752, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8652265071868896, + "num_tokens": 273960732.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "grad_norm": 1.4868437051773071, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8832699060440063, + "num_tokens": 273994907.0, + "step": 7184 + }, + { + "epoch": 0.9140058516728151, + "grad_norm": 1.4967617988586426, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8768671154975891, + "num_tokens": 274032171.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "grad_norm": 1.3071436882019043, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8881733417510986, + "num_tokens": 274072532.0, + "step": 7186 + }, + { + "epoch": 0.9142602722299962, + "grad_norm": 1.5124047994613647, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8619289398193359, + "num_tokens": 274110449.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "grad_norm": 1.4220037460327148, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8721264600753784, + "num_tokens": 274148719.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "grad_norm": 1.4063692092895508, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.867303729057312, + "num_tokens": 274191531.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "grad_norm": 1.5077462196350098, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8762892484664917, + "num_tokens": 274226797.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "grad_norm": 1.4721430540084839, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8732939958572388, + "num_tokens": 274261038.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "grad_norm": 1.402853012084961, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8849452137947083, + "num_tokens": 
274299822.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "grad_norm": 1.508068561553955, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8809874653816223, + "num_tokens": 274331790.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "grad_norm": 1.4376614093780518, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8794633150100708, + "num_tokens": 274370130.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "grad_norm": 1.495898962020874, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8818986415863037, + "num_tokens": 274408976.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "grad_norm": 1.5505815744400024, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8586089611053467, + "num_tokens": 274443550.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "grad_norm": 1.6083093881607056, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8715712428092957, + "num_tokens": 274476903.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "grad_norm": 1.534156322479248, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8775700330734253, + "num_tokens": 274510088.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "grad_norm": 1.4921185970306396, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8720679879188538, + "num_tokens": 274549202.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "grad_norm": 1.466776728630066, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8691622614860535, + "num_tokens": 274588975.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "grad_norm": 1.4296321868896484, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8884273767471313, + "num_tokens": 274627245.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "grad_norm": 1.5322932004928589, + "learning_rate": 1e-06, + 
"loss": 0.4213, + "mean_token_accuracy": 0.8545655012130737, + "num_tokens": 274665450.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "grad_norm": 1.502024531364441, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8631032109260559, + "num_tokens": 274703692.0, + "step": 7203 + }, + { + "epoch": 0.9164228469660348, + "grad_norm": 1.530632734298706, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8484014272689819, + "num_tokens": 274742559.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "grad_norm": 1.6078054904937744, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.866658091545105, + "num_tokens": 274774414.0, + "step": 7205 + }, + { + "epoch": 0.9166772675232159, + "grad_norm": 1.5845156908035278, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8602141737937927, + "num_tokens": 274805909.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "grad_norm": 1.4086211919784546, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8578599095344543, + "num_tokens": 274846176.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "grad_norm": 1.4564950466156006, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8754168748855591, + "num_tokens": 274883067.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "grad_norm": 1.708809733390808, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8683478832244873, + "num_tokens": 274918672.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "grad_norm": 1.4457049369812012, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8681660890579224, + "num_tokens": 274957680.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "grad_norm": 1.4061532020568848, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8730453252792358, + "num_tokens": 274996630.0, + "step": 7211 + }, + { + "epoch": 
0.9174405291947589, + "grad_norm": 1.389148235321045, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8766797780990601, + "num_tokens": 275035868.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "grad_norm": 1.5528430938720703, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8648878335952759, + "num_tokens": 275070608.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "grad_norm": 1.4090495109558105, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8825229406356812, + "num_tokens": 275112057.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "grad_norm": 1.4960962533950806, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8771483302116394, + "num_tokens": 275150403.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "grad_norm": 1.4840635061264038, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8639113306999207, + "num_tokens": 275189224.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "grad_norm": 1.439399242401123, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8661712408065796, + "num_tokens": 275229511.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "grad_norm": 1.5634251832962036, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8547787666320801, + "num_tokens": 275268954.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "grad_norm": 1.6278929710388184, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8565231561660767, + "num_tokens": 275304762.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "grad_norm": 1.4908396005630493, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8576730489730835, + "num_tokens": 275341492.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "grad_norm": 1.4340094327926636, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 
0.8703752756118774, + "num_tokens": 275383488.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "grad_norm": 1.4959039688110352, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8595762848854065, + "num_tokens": 275424424.0, + "step": 7222 + }, + { + "epoch": 0.9188398422592545, + "grad_norm": 1.5488749742507935, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8761002421379089, + "num_tokens": 275457787.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "grad_norm": 1.3872367143630981, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8598612546920776, + "num_tokens": 275500451.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "grad_norm": 1.420318841934204, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8731096982955933, + "num_tokens": 275538001.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "grad_norm": 1.511419415473938, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8700457811355591, + "num_tokens": 275577573.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "grad_norm": 1.5895406007766724, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8638914823532104, + "num_tokens": 275614552.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "grad_norm": 1.597844123840332, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.859876275062561, + "num_tokens": 275648878.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "grad_norm": 1.5086519718170166, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.875478982925415, + "num_tokens": 275683207.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "grad_norm": 1.677900791168213, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8523745536804199, + "num_tokens": 275717110.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "grad_norm": 
1.564168095588684, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8751204609870911, + "num_tokens": 275751969.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "grad_norm": 1.3276615142822266, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8583964109420776, + "num_tokens": 275798995.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "grad_norm": 1.4978845119476318, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8743941187858582, + "num_tokens": 275838744.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "grad_norm": 1.4780791997909546, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8630561232566833, + "num_tokens": 275877951.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "grad_norm": 1.5233795642852783, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.86662757396698, + "num_tokens": 275910446.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "grad_norm": 1.4713271856307983, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.872401773929596, + "num_tokens": 275946847.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "grad_norm": 1.4914069175720215, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8634839057922363, + "num_tokens": 275984634.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "grad_norm": 1.4922250509262085, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8709347248077393, + "num_tokens": 276022846.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "grad_norm": 1.4811071157455444, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8691380620002747, + "num_tokens": 276061693.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "grad_norm": 1.5925997495651245, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8584579229354858, + "num_tokens": 
276102602.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "grad_norm": 1.4585450887680054, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8700157403945923, + "num_tokens": 276142328.0, + "step": 7241 + }, + { + "epoch": 0.9212568375524742, + "grad_norm": 1.3622483015060425, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8969058394432068, + "num_tokens": 276180362.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "grad_norm": 1.4888505935668945, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8664374351501465, + "num_tokens": 276217759.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "grad_norm": 1.4206156730651855, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8620839715003967, + "num_tokens": 276259133.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "grad_norm": 1.4245768785476685, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8651958703994751, + "num_tokens": 276305124.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "grad_norm": 1.516330599784851, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8634681105613708, + "num_tokens": 276340097.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "grad_norm": 1.5967494249343872, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8603169918060303, + "num_tokens": 276373672.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "grad_norm": 1.4628013372421265, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8723214268684387, + "num_tokens": 276413181.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "grad_norm": 1.6074517965316772, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8610379695892334, + "num_tokens": 276447851.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "grad_norm": 1.3985072374343872, + "learning_rate": 1e-06, 
+ "loss": 0.3926, + "mean_token_accuracy": 0.8652951717376709, + "num_tokens": 276493296.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "grad_norm": 1.5633445978164673, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8837518692016602, + "num_tokens": 276528304.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "grad_norm": 1.6771937608718872, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8613317012786865, + "num_tokens": 276558737.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "grad_norm": 1.617569923400879, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8649322986602783, + "num_tokens": 276591575.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "grad_norm": 1.54978609085083, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8643326759338379, + "num_tokens": 276626835.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "grad_norm": 1.4584485292434692, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8746387958526611, + "num_tokens": 276664753.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "grad_norm": 1.5725206136703491, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8606210947036743, + "num_tokens": 276704365.0, + "step": 7256 + }, + { + "epoch": 0.9231649917313319, + "grad_norm": 1.601471185684204, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8601057529449463, + "num_tokens": 276742833.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "grad_norm": 1.4815664291381836, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8774755597114563, + "num_tokens": 276777954.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "grad_norm": 1.463081955909729, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8620555400848389, + "num_tokens": 276815660.0, + "step": 7259 + }, + { + "epoch": 
0.9235466225671034, + "grad_norm": 1.5589351654052734, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8791626691818237, + "num_tokens": 276847255.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "grad_norm": 1.5400935411453247, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8544214963912964, + "num_tokens": 276884644.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "grad_norm": 1.466428518295288, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8676514625549316, + "num_tokens": 276921169.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "grad_norm": 1.3486716747283936, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8753551244735718, + "num_tokens": 276963291.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "grad_norm": 1.5935356616973877, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8629122376441956, + "num_tokens": 276997220.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "grad_norm": 1.4439231157302856, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8512288331985474, + "num_tokens": 277037606.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "grad_norm": 1.2710113525390625, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8852449059486389, + "num_tokens": 277084546.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "grad_norm": 1.5244637727737427, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8510620594024658, + "num_tokens": 277120704.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "grad_norm": 1.477313756942749, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8720955848693848, + "num_tokens": 277158646.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "grad_norm": 1.6747980117797852, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 
0.850860595703125, + "num_tokens": 277189931.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "grad_norm": 1.3907806873321533, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8836296200752258, + "num_tokens": 277226410.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "grad_norm": 1.342957854270935, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8676486015319824, + "num_tokens": 277270111.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "grad_norm": 1.4135777950286865, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8725314736366272, + "num_tokens": 277307906.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "grad_norm": 1.446460485458374, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8626648187637329, + "num_tokens": 277346353.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "grad_norm": 1.545491337776184, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8759620189666748, + "num_tokens": 277381063.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "grad_norm": 1.534477949142456, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8636882305145264, + "num_tokens": 277422043.0, + "step": 7275 + }, + { + "epoch": 0.9255819870245516, + "grad_norm": 1.4243656396865845, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8701306581497192, + "num_tokens": 277461539.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "grad_norm": 1.5883735418319702, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8679137229919434, + "num_tokens": 277493695.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "grad_norm": 1.549770474433899, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8726311326026917, + "num_tokens": 277527848.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "grad_norm": 1.500341773033142, 
+ "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8672770857810974, + "num_tokens": 277564884.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "grad_norm": 1.572564959526062, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8534810543060303, + "num_tokens": 277601161.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "grad_norm": 1.4957120418548584, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8736604452133179, + "num_tokens": 277639446.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "grad_norm": 1.4516961574554443, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8675711750984192, + "num_tokens": 277677017.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "grad_norm": 1.3420989513397217, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8813198804855347, + "num_tokens": 277721306.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "grad_norm": 1.3843032121658325, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8592065572738647, + "num_tokens": 277765360.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "grad_norm": 1.4158923625946045, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8763298988342285, + "num_tokens": 277803244.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "grad_norm": 1.453645944595337, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8612521886825562, + "num_tokens": 277846238.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "grad_norm": 1.5115270614624023, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8717937469482422, + "num_tokens": 277880878.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "grad_norm": 1.5456377267837524, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8763450980186462, + "num_tokens": 277916061.0, + "step": 
7288 + }, + { + "epoch": 0.9272357206462282, + "grad_norm": 1.44228994846344, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8597668409347534, + "num_tokens": 277956613.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "grad_norm": 1.4164091348648071, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8673546314239502, + "num_tokens": 277998851.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "grad_norm": 1.3133559226989746, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8831261396408081, + "num_tokens": 278039441.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "grad_norm": 1.3444161415100098, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8724814653396606, + "num_tokens": 278079113.0, + "step": 7292 + }, + { + "epoch": 0.9277445617605903, + "grad_norm": 1.4776118993759155, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8674747347831726, + "num_tokens": 278121322.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "grad_norm": 1.4695378541946411, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8804737329483032, + "num_tokens": 278157139.0, + "step": 7294 + }, + { + "epoch": 0.9279989823177712, + "grad_norm": 1.665808081626892, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8662195205688477, + "num_tokens": 278189552.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "grad_norm": 1.5389752388000488, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8632246255874634, + "num_tokens": 278223827.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "grad_norm": 1.511406660079956, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8686747550964355, + "num_tokens": 278259457.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "grad_norm": 1.5622109174728394, + "learning_rate": 1e-06, + "loss": 0.3485, + 
"mean_token_accuracy": 0.8760932683944702, + "num_tokens": 278295302.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "grad_norm": 1.4539508819580078, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.865559995174408, + "num_tokens": 278335762.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "grad_norm": 1.448420763015747, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.870966374874115, + "num_tokens": 278376378.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "grad_norm": 1.4547044038772583, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8583627939224243, + "num_tokens": 278417145.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "grad_norm": 1.5586529970169067, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8582426905632019, + "num_tokens": 278452692.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "grad_norm": 1.5063502788543701, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8643717765808105, + "num_tokens": 278490054.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "grad_norm": 1.4914556741714478, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8702573776245117, + "num_tokens": 278528428.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "grad_norm": 1.4568344354629517, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8549273014068604, + "num_tokens": 278567382.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "grad_norm": 1.4478563070297241, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8598829507827759, + "num_tokens": 278607862.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "grad_norm": 1.6293220520019531, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8478211164474487, + "num_tokens": 278644457.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + 
"grad_norm": 1.3845754861831665, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.890810489654541, + "num_tokens": 278681327.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "grad_norm": 1.6409497261047363, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8668462634086609, + "num_tokens": 278716695.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "grad_norm": 1.5386159420013428, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8608195781707764, + "num_tokens": 278754741.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "grad_norm": 1.3619234561920166, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8807179927825928, + "num_tokens": 278796720.0, + "step": 7311 + }, + { + "epoch": 0.93016155705381, + "grad_norm": 1.572117567062378, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.872483491897583, + "num_tokens": 278830436.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "grad_norm": 1.6257673501968384, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8479633927345276, + "num_tokens": 278864454.0, + "step": 7313 + }, + { + "epoch": 0.9304159776109909, + "grad_norm": 1.4814776182174683, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8710749745368958, + "num_tokens": 278905072.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "grad_norm": 1.4435909986495972, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8520861864089966, + "num_tokens": 278948228.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "grad_norm": 1.5746467113494873, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8594639301300049, + "num_tokens": 278989583.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "grad_norm": 1.3513997793197632, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8723047971725464, + 
"num_tokens": 279034793.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "grad_norm": 1.4013333320617676, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.86702561378479, + "num_tokens": 279073551.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "grad_norm": 1.5356420278549194, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.839853048324585, + "num_tokens": 279115343.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "grad_norm": 1.3932148218154907, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8916348218917847, + "num_tokens": 279150476.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "grad_norm": 1.5110269784927368, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8454803228378296, + "num_tokens": 279192169.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "grad_norm": 1.4468464851379395, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.871052622795105, + "num_tokens": 279230656.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "grad_norm": 1.3997923135757446, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8816397786140442, + "num_tokens": 279268894.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "grad_norm": 1.5064654350280762, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8606581687927246, + "num_tokens": 279308050.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "grad_norm": 1.5388338565826416, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8640774488449097, + "num_tokens": 279340323.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "grad_norm": 1.5340747833251953, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8704640865325928, + "num_tokens": 279373002.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "grad_norm": 1.4605228900909424, + "learning_rate": 
1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.875763475894928, + "num_tokens": 279408511.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "grad_norm": 1.4084211587905884, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8781566619873047, + "num_tokens": 279448066.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "grad_norm": 1.4604452848434448, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8572523593902588, + "num_tokens": 279493332.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "grad_norm": 1.4369699954986572, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8677432537078857, + "num_tokens": 279533944.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "grad_norm": 1.5375062227249146, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8687583208084106, + "num_tokens": 279571950.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "grad_norm": 1.5195544958114624, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8679931163787842, + "num_tokens": 279609275.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "grad_norm": 1.5753872394561768, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8653258085250854, + "num_tokens": 279643939.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "grad_norm": 1.4517006874084473, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8586225509643555, + "num_tokens": 279683857.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "grad_norm": 1.4337091445922852, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8814643025398254, + "num_tokens": 279720819.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "grad_norm": 1.4381892681121826, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8818941116333008, + "num_tokens": 279753732.0, + "step": 7336 + }, + { + 
"epoch": 0.9333418140185727, + "grad_norm": 1.4163144826889038, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8640496134757996, + "num_tokens": 279794964.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "grad_norm": 1.5696877241134644, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8568450212478638, + "num_tokens": 279832249.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "grad_norm": 1.6439762115478516, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8563761711120605, + "num_tokens": 279870673.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "grad_norm": 1.4439349174499512, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8590580821037292, + "num_tokens": 279911980.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "grad_norm": 1.6907269954681396, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.88725745677948, + "num_tokens": 279945583.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "grad_norm": 1.404154896736145, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8695025444030762, + "num_tokens": 279987882.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "grad_norm": 1.3982617855072021, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8727985620498657, + "num_tokens": 280030027.0, + "step": 7343 + }, + { + "epoch": 0.9342322859687062, + "grad_norm": 1.3314465284347534, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8781407475471497, + "num_tokens": 280072736.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "grad_norm": 1.4096688032150269, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8537518382072449, + "num_tokens": 280117389.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "grad_norm": 1.3732942342758179, + "learning_rate": 1e-06, + "loss": 0.3563, + 
"mean_token_accuracy": 0.877281129360199, + "num_tokens": 280158098.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "grad_norm": 1.621309757232666, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8572035431861877, + "num_tokens": 280192669.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "grad_norm": 1.4277923107147217, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8648866415023804, + "num_tokens": 280230507.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "grad_norm": 1.4246790409088135, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8663648366928101, + "num_tokens": 280267336.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "grad_norm": 1.402514100074768, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8658212423324585, + "num_tokens": 280309001.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "grad_norm": 1.6170179843902588, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8575760126113892, + "num_tokens": 280343315.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "grad_norm": 1.4633361101150513, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8696939945220947, + "num_tokens": 280381793.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "grad_norm": 1.5939722061157227, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8495726585388184, + "num_tokens": 280419093.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "grad_norm": 1.6505801677703857, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.86805659532547, + "num_tokens": 280452152.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "grad_norm": 1.5983251333236694, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8676506876945496, + "num_tokens": 280483731.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + 
"grad_norm": 1.487095832824707, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8782694339752197, + "num_tokens": 280523286.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "grad_norm": 1.3858866691589355, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8729142546653748, + "num_tokens": 280562195.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "grad_norm": 1.470705509185791, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8798129558563232, + "num_tokens": 280595633.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "grad_norm": 1.4428889751434326, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8626106381416321, + "num_tokens": 280634472.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "grad_norm": 1.5178313255310059, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8575738668441772, + "num_tokens": 280673553.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "grad_norm": 1.47747802734375, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8932563066482544, + "num_tokens": 280705481.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "grad_norm": 1.427889108657837, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8765292167663574, + "num_tokens": 280746894.0, + "step": 7362 + }, + { + "epoch": 0.9366492812619259, + "grad_norm": 1.3242902755737305, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.869629979133606, + "num_tokens": 280790759.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "grad_norm": 1.3742549419403076, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8818255662918091, + "num_tokens": 280831137.0, + "step": 7364 + }, + { + "epoch": 0.936903701819107, + "grad_norm": 1.479288101196289, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8730854988098145, + "num_tokens": 
280870391.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "grad_norm": 1.4236844778060913, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8726956844329834, + "num_tokens": 280910707.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "grad_norm": 1.425917387008667, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8724030256271362, + "num_tokens": 280948330.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "grad_norm": 1.4868208169937134, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8708222508430481, + "num_tokens": 280987432.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "grad_norm": 1.4851981401443481, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8642132878303528, + "num_tokens": 281025392.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "grad_norm": 1.3945900201797485, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8738027811050415, + "num_tokens": 281066996.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "grad_norm": 1.357143759727478, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.873349666595459, + "num_tokens": 281111668.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "grad_norm": 1.4818929433822632, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8700293302536011, + "num_tokens": 281149337.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "grad_norm": 1.5346516370773315, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8765148520469666, + "num_tokens": 281182441.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "grad_norm": 1.4522554874420166, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8663520812988281, + "num_tokens": 281224894.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "grad_norm": 1.5505309104919434, + "learning_rate": 1e-06, + "loss": 
0.4412, + "mean_token_accuracy": 0.8471308946609497, + "num_tokens": 281266087.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "grad_norm": 1.404891848564148, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8670531511306763, + "num_tokens": 281309568.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "grad_norm": 1.55661141872406, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8419378995895386, + "num_tokens": 281350128.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "grad_norm": 1.478222370147705, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8532718420028687, + "num_tokens": 281389063.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "grad_norm": 1.500569462776184, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8764920234680176, + "num_tokens": 281426945.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "grad_norm": 1.3860750198364258, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.868895947933197, + "num_tokens": 281469009.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "grad_norm": 1.4300599098205566, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8717866539955139, + "num_tokens": 281506347.0, + "step": 7381 + }, + { + "epoch": 0.9390662765551456, + "grad_norm": 1.660152792930603, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8817888498306274, + "num_tokens": 281539266.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "grad_norm": 1.439695119857788, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8585070371627808, + "num_tokens": 281581421.0, + "step": 7383 + }, + { + "epoch": 0.9393206971123267, + "grad_norm": 1.36811101436615, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8749011754989624, + "num_tokens": 281618602.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, 
+ "grad_norm": 1.542544960975647, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8677324056625366, + "num_tokens": 281651514.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "grad_norm": 1.4437659978866577, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8893995881080627, + "num_tokens": 281684308.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "grad_norm": 1.3735660314559937, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8622987270355225, + "num_tokens": 281724478.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "grad_norm": 1.4286555051803589, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8673887848854065, + "num_tokens": 281766055.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "grad_norm": 1.468106746673584, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.860843300819397, + "num_tokens": 281803692.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "grad_norm": 1.4562965631484985, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8777341842651367, + "num_tokens": 281842525.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "grad_norm": 1.4299819469451904, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8776146173477173, + "num_tokens": 281881730.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "grad_norm": 1.5193053483963013, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8716357350349426, + "num_tokens": 281919119.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "grad_norm": 1.4628376960754395, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8479009866714478, + "num_tokens": 281959016.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "grad_norm": 1.575981616973877, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8777785897254944, + 
"num_tokens": 281989105.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "grad_norm": 1.3856956958770752, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8685848712921143, + "num_tokens": 282030294.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "grad_norm": 1.3333215713500977, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8701138496398926, + "num_tokens": 282072253.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "grad_norm": 1.4169657230377197, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8678103685379028, + "num_tokens": 282110695.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "grad_norm": 1.4225772619247437, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8789288997650146, + "num_tokens": 282149782.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "grad_norm": 1.5045214891433716, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8709315657615662, + "num_tokens": 282187036.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "grad_norm": 1.4936587810516357, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8717859983444214, + "num_tokens": 282224450.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "grad_norm": 1.420327067375183, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8518847227096558, + "num_tokens": 282267145.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "grad_norm": 1.5258769989013672, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8506495952606201, + "num_tokens": 282305507.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + "grad_norm": 1.5202605724334717, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8594817519187927, + "num_tokens": 282341637.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "grad_norm": 1.4794542789459229, + 
"learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8544661998748779, + "num_tokens": 282381743.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "grad_norm": 1.592885136604309, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8625115156173706, + "num_tokens": 282415887.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "grad_norm": 1.3725956678390503, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.884429931640625, + "num_tokens": 282455356.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "grad_norm": 1.5279263257980347, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.871646523475647, + "num_tokens": 282492674.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "grad_norm": 1.452553629875183, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8747533559799194, + "num_tokens": 282535863.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "grad_norm": 1.493653416633606, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8530223369598389, + "num_tokens": 282576371.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "grad_norm": 1.3876943588256836, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8815333247184753, + "num_tokens": 282612978.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "grad_norm": 1.4278472661972046, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8732998371124268, + "num_tokens": 282648458.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "grad_norm": 1.435798168182373, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8653982281684875, + "num_tokens": 282690724.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "grad_norm": 1.4604449272155762, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8639954328536987, + "num_tokens": 282730817.0, + "step": 7413 + }, 
+ { + "epoch": 0.943137005470042, + "grad_norm": 1.487899661064148, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8628849983215332, + "num_tokens": 282775468.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "grad_norm": 1.4424352645874023, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8694131374359131, + "num_tokens": 282815415.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "grad_norm": 1.4704506397247314, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.86011803150177, + "num_tokens": 282855691.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "grad_norm": 1.354343295097351, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8788620233535767, + "num_tokens": 282897134.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "grad_norm": 1.641645908355713, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8736361265182495, + "num_tokens": 282933178.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "grad_norm": 1.3570983409881592, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8744156360626221, + "num_tokens": 282974471.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "grad_norm": 1.34589684009552, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8641548156738281, + "num_tokens": 283024014.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "grad_norm": 1.4007480144500732, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.861233115196228, + "num_tokens": 283070047.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "grad_norm": 1.541836142539978, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8769824504852295, + "num_tokens": 283107782.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "grad_norm": 1.5519508123397827, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 
0.8664858341217041, + "num_tokens": 283141543.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "grad_norm": 1.3832731246948242, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8662387728691101, + "num_tokens": 283182595.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "grad_norm": 1.5576239824295044, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8721837997436523, + "num_tokens": 283213811.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "grad_norm": 1.5786939859390259, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8666123747825623, + "num_tokens": 283249054.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "grad_norm": 1.4451230764389038, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8680897355079651, + "num_tokens": 283285746.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "grad_norm": 1.5372288227081299, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8732608556747437, + "num_tokens": 283318958.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "grad_norm": 1.416926622390747, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8649898171424866, + "num_tokens": 283361251.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "grad_norm": 1.500867486000061, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8616772890090942, + "num_tokens": 283399063.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "grad_norm": 1.45173978805542, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8698281645774841, + "num_tokens": 283437524.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "grad_norm": 1.5541342496871948, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8807384371757507, + "num_tokens": 283467958.0, + "step": 7432 + }, + { + "epoch": 0.9455540007632617, + "grad_norm": 
1.4088926315307617, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8826336860656738, + "num_tokens": 283505478.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "grad_norm": 1.5407874584197998, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8559474945068359, + "num_tokens": 283540019.0, + "step": 7434 + }, + { + "epoch": 0.9458084213204427, + "grad_norm": 1.539727807044983, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8643536567687988, + "num_tokens": 283578966.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "grad_norm": 1.3618738651275635, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8606523871421814, + "num_tokens": 283623715.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "grad_norm": 1.4305654764175415, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8696426153182983, + "num_tokens": 283661094.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "grad_norm": 1.4329088926315308, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8687117099761963, + "num_tokens": 283702566.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "grad_norm": 1.3646608591079712, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8758988976478577, + "num_tokens": 283747720.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "grad_norm": 1.3942526578903198, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8548595905303955, + "num_tokens": 283793144.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "grad_norm": 1.6336458921432495, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.875146210193634, + "num_tokens": 283822570.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "grad_norm": 1.4586857557296753, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8668334484100342, + "num_tokens": 
283864451.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "grad_norm": 1.4769178628921509, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8737888932228088, + "num_tokens": 283900056.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "grad_norm": 1.539078950881958, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8736450672149658, + "num_tokens": 283935395.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "grad_norm": 1.6515448093414307, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8527193069458008, + "num_tokens": 283969354.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "grad_norm": 1.6752679347991943, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8779982328414917, + "num_tokens": 284001761.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "grad_norm": 1.493749737739563, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8690516948699951, + "num_tokens": 284036934.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "grad_norm": 1.409811019897461, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8642600774765015, + "num_tokens": 284079432.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "grad_norm": 1.2788764238357544, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8732262849807739, + "num_tokens": 284127741.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "grad_norm": 1.592509150505066, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8725665211677551, + "num_tokens": 284158733.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "grad_norm": 1.4445323944091797, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8515922427177429, + "num_tokens": 284200553.0, + "step": 7451 + }, + { + "epoch": 0.9479709960564814, + "grad_norm": 1.4067386388778687, + "learning_rate": 1e-06, + 
"loss": 0.3428, + "mean_token_accuracy": 0.8825175762176514, + "num_tokens": 284237499.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "grad_norm": 1.394118309020996, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8613932132720947, + "num_tokens": 284277482.0, + "step": 7453 + }, + { + "epoch": 0.9482254166136623, + "grad_norm": 1.5405843257904053, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8646731376647949, + "num_tokens": 284315382.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "grad_norm": 1.5059773921966553, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8762509822845459, + "num_tokens": 284350340.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "grad_norm": 1.4904245138168335, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8528773188591003, + "num_tokens": 284393525.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "grad_norm": 1.4952605962753296, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.868689775466919, + "num_tokens": 284431028.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "grad_norm": 1.3760491609573364, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8740360140800476, + "num_tokens": 284469482.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "grad_norm": 1.6564215421676636, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8675132989883423, + "num_tokens": 284499919.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "grad_norm": 1.488020896911621, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8804125189781189, + "num_tokens": 284533435.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "grad_norm": 1.6514604091644287, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8667388558387756, + "num_tokens": 284564743.0, + "step": 7461 + }, + { + "epoch": 
0.9492430988423864, + "grad_norm": 1.5602409839630127, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8825874328613281, + "num_tokens": 284597705.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "grad_norm": 1.6528396606445312, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8675315380096436, + "num_tokens": 284631366.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "grad_norm": 1.312027931213379, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8764172792434692, + "num_tokens": 284674772.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "grad_norm": 1.512791633605957, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8716006875038147, + "num_tokens": 284708811.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "grad_norm": 1.3695772886276245, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8761501908302307, + "num_tokens": 284752787.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "grad_norm": 1.511398196220398, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8599251508712769, + "num_tokens": 284794231.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "grad_norm": 1.4688456058502197, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8685785531997681, + "num_tokens": 284831510.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "grad_norm": 1.482709527015686, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8814533948898315, + "num_tokens": 284869309.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "grad_norm": 1.4541105031967163, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8584887981414795, + "num_tokens": 284912395.0, + "step": 7470 + }, + { + "epoch": 0.9503879913497011, + "grad_norm": 1.4813436269760132, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 
0.8707391619682312, + "num_tokens": 284949138.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "grad_norm": 1.4268568754196167, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8824522495269775, + "num_tokens": 284985777.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, + "grad_norm": 1.5383797883987427, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8854745030403137, + "num_tokens": 285019564.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "grad_norm": 1.4640969038009644, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8775094151496887, + "num_tokens": 285060194.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "grad_norm": 1.621827483177185, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.875653862953186, + "num_tokens": 285091096.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "grad_norm": 1.523097038269043, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8706186413764954, + "num_tokens": 285124599.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "grad_norm": 1.5978972911834717, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8672958016395569, + "num_tokens": 285161922.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "grad_norm": 1.4090577363967896, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8637062311172485, + "num_tokens": 285201191.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "grad_norm": 1.6272788047790527, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8689146041870117, + "num_tokens": 285234574.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "grad_norm": 1.3628754615783691, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8702549338340759, + "num_tokens": 285275601.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "grad_norm": 
1.384420394897461, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8778184652328491, + "num_tokens": 285314210.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "grad_norm": 1.4008839130401611, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8707906603813171, + "num_tokens": 285357690.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "grad_norm": 1.500780463218689, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8639397025108337, + "num_tokens": 285396080.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "grad_norm": 1.3383642435073853, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.868763267993927, + "num_tokens": 285438938.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "grad_norm": 1.5684319734573364, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8783231973648071, + "num_tokens": 285475668.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "grad_norm": 1.367458701133728, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8670293688774109, + "num_tokens": 285519747.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "grad_norm": 1.5268367528915405, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8688122034072876, + "num_tokens": 285554171.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "grad_norm": 1.4746507406234741, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8910682201385498, + "num_tokens": 285587165.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "grad_norm": 1.4922224283218384, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8724400997161865, + "num_tokens": 285623366.0, + "step": 7489 + }, + { + "epoch": 0.9528049866429208, + "grad_norm": 1.4629275798797607, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8733802437782288, + "num_tokens": 
285659305.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "grad_norm": 1.5313451290130615, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8564832210540771, + "num_tokens": 285697471.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "grad_norm": 1.5059192180633545, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8585761785507202, + "num_tokens": 285737611.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "grad_norm": 1.4664210081100464, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8730431795120239, + "num_tokens": 285774757.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "grad_norm": 1.5442301034927368, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8416618704795837, + "num_tokens": 285812915.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "grad_norm": 1.4948827028274536, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8620619773864746, + "num_tokens": 285849039.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "grad_norm": 1.4908196926116943, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8775855302810669, + "num_tokens": 285883623.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "grad_norm": 1.629517912864685, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8569414615631104, + "num_tokens": 285918304.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "grad_norm": 1.513006567955017, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8637553453445435, + "num_tokens": 285957063.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "grad_norm": 1.3831298351287842, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8810506463050842, + "num_tokens": 285995560.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "grad_norm": 1.5478591918945312, + "learning_rate": 1e-06, + 
"loss": 0.3838, + "mean_token_accuracy": 0.8661400675773621, + "num_tokens": 286031562.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "grad_norm": 1.4085211753845215, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.87401282787323, + "num_tokens": 286074963.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "grad_norm": 1.312867283821106, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8766396045684814, + "num_tokens": 286119509.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "grad_norm": 1.4160679578781128, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8785686492919922, + "num_tokens": 286155237.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "grad_norm": 1.542073369026184, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8598777055740356, + "num_tokens": 286191514.0, + "step": 7504 + }, + { + "epoch": 0.9547131408217784, + "grad_norm": 1.486883282661438, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8649253845214844, + "num_tokens": 286229724.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "grad_norm": 1.3667123317718506, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8807060718536377, + "num_tokens": 286269073.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "grad_norm": 1.5085128545761108, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8740962147712708, + "num_tokens": 286301904.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "grad_norm": 1.5756968259811401, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8676441311836243, + "num_tokens": 286339685.0, + "step": 7508 + }, + { + "epoch": 0.9552219819361404, + "grad_norm": 1.606932282447815, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8663989305496216, + "num_tokens": 286377009.0, + "step": 7509 + }, + { + "epoch": 
0.9553491922147309, + "grad_norm": 1.560651183128357, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8899027109146118, + "num_tokens": 286407261.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "grad_norm": 1.5124764442443848, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8654000759124756, + "num_tokens": 286446894.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "grad_norm": 1.4737597703933716, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8702909350395203, + "num_tokens": 286482494.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "grad_norm": 1.6019260883331299, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8558697700500488, + "num_tokens": 286517891.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "grad_norm": 1.5439311265945435, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8543533682823181, + "num_tokens": 286560631.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "grad_norm": 1.547709345817566, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8699897527694702, + "num_tokens": 286599546.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "grad_norm": 1.5841683149337769, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8793001174926758, + "num_tokens": 286637803.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "grad_norm": 1.3405038118362427, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8728693723678589, + "num_tokens": 286677811.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "grad_norm": 1.527108073234558, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8823709487915039, + "num_tokens": 286714367.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "grad_norm": 1.4320625066757202, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 
0.8754999041557312, + "num_tokens": 286753241.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "grad_norm": 1.6208513975143433, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8744857907295227, + "num_tokens": 286784599.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "grad_norm": 1.3855938911437988, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8827939033508301, + "num_tokens": 286823589.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "grad_norm": 1.4149792194366455, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8624013066291809, + "num_tokens": 286866878.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "grad_norm": 1.362587809562683, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8761551380157471, + "num_tokens": 286907451.0, + "step": 7523 + }, + { + "epoch": 0.9571301361149981, + "grad_norm": 1.4296363592147827, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8650568127632141, + "num_tokens": 286947142.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "grad_norm": 1.6279278993606567, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8465664386749268, + "num_tokens": 286984916.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "grad_norm": 1.4391227960586548, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8811770677566528, + "num_tokens": 287019251.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "grad_norm": 1.6180081367492676, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8601505756378174, + "num_tokens": 287054092.0, + "step": 7527 + }, + { + "epoch": 0.9576389772293601, + "grad_norm": 1.4847332239151, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8798183798789978, + "num_tokens": 287092520.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "grad_norm": 
1.8757333755493164, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8633265495300293, + "num_tokens": 287121081.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "grad_norm": 1.5720512866973877, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8908008337020874, + "num_tokens": 287154136.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "grad_norm": 1.6336995363235474, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8863131999969482, + "num_tokens": 287183952.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "grad_norm": 1.4894416332244873, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.877231240272522, + "num_tokens": 287219687.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "grad_norm": 1.607282042503357, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8837692737579346, + "num_tokens": 287251689.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "grad_norm": 1.7326602935791016, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8664983510971069, + "num_tokens": 287282890.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "grad_norm": 1.4442483186721802, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8669017553329468, + "num_tokens": 287319284.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "grad_norm": 1.5265251398086548, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8617789149284363, + "num_tokens": 287360443.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "grad_norm": 1.5109004974365234, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8726903796195984, + "num_tokens": 287398251.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "grad_norm": 1.388766884803772, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.874911367893219, + "num_tokens": 
287438108.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "grad_norm": 1.5380473136901855, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8776178359985352, + "num_tokens": 287473649.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "grad_norm": 1.3801140785217285, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8810572028160095, + "num_tokens": 287515795.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "grad_norm": 1.399464726448059, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8668734431266785, + "num_tokens": 287556653.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "grad_norm": 1.5216615200042725, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8617075681686401, + "num_tokens": 287592869.0, + "step": 7542 + }, + { + "epoch": 0.9595471314082178, + "grad_norm": 1.4191795587539673, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8578575849533081, + "num_tokens": 287632999.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "grad_norm": 1.4267326593399048, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8513675928115845, + "num_tokens": 287677692.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "grad_norm": 1.5205163955688477, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8645555377006531, + "num_tokens": 287712766.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "grad_norm": 1.442763328552246, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8731198310852051, + "num_tokens": 287753694.0, + "step": 7546 + }, + { + "epoch": 0.9600559725225798, + "grad_norm": 1.5753953456878662, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8653264045715332, + "num_tokens": 287787550.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "grad_norm": 1.41436767578125, + "learning_rate": 1e-06, + 
"loss": 0.3661, + "mean_token_accuracy": 0.8702574968338013, + "num_tokens": 287826610.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "grad_norm": 1.5511716604232788, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8642624616622925, + "num_tokens": 287861066.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "grad_norm": 1.653275966644287, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8687278628349304, + "num_tokens": 287903948.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "grad_norm": 1.4068015813827515, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8837602138519287, + "num_tokens": 287943899.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "grad_norm": 1.5290840864181519, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8705450892448425, + "num_tokens": 287979979.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "grad_norm": 1.3634357452392578, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.882247805595398, + "num_tokens": 288018670.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "grad_norm": 1.5637458562850952, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8763050436973572, + "num_tokens": 288056083.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "grad_norm": 1.5383026599884033, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8739866018295288, + "num_tokens": 288093054.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "grad_norm": 1.4411652088165283, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8652126789093018, + "num_tokens": 288135526.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "grad_norm": 1.428798794746399, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8673508167266846, + "num_tokens": 288178692.0, + "step": 7557 + }, + { + "epoch": 
0.9614552855870754, + "grad_norm": 1.5805879831314087, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8658632636070251, + "num_tokens": 288211856.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "grad_norm": 1.3398871421813965, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8768215179443359, + "num_tokens": 288255522.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "grad_norm": 1.3844835758209229, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8769605755805969, + "num_tokens": 288291851.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "grad_norm": 1.5601329803466797, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8635355830192566, + "num_tokens": 288329360.0, + "step": 7561 + }, + { + "epoch": 0.9619641267014375, + "grad_norm": 1.4975694417953491, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8617958426475525, + "num_tokens": 288365059.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "grad_norm": 1.4403833150863647, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8760800361633301, + "num_tokens": 288404936.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "grad_norm": 1.4679198265075684, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8609936237335205, + "num_tokens": 288443194.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "grad_norm": 1.5113368034362793, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.860662043094635, + "num_tokens": 288480857.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, + "grad_norm": 1.3689905405044556, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8791759014129639, + "num_tokens": 288522077.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "grad_norm": 1.4450392723083496, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 
0.8765180706977844, + "num_tokens": 288559111.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "grad_norm": 1.4841456413269043, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8758098483085632, + "num_tokens": 288594150.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "grad_norm": 1.3789342641830444, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.868807315826416, + "num_tokens": 288640632.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "grad_norm": 1.5943752527236938, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.870688796043396, + "num_tokens": 288675014.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "grad_norm": 1.454630732536316, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8594924211502075, + "num_tokens": 288717107.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "grad_norm": 1.4343430995941162, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.877228856086731, + "num_tokens": 288757520.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "grad_norm": 1.4126708507537842, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8691231608390808, + "num_tokens": 288798804.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "grad_norm": 1.5109269618988037, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8575029373168945, + "num_tokens": 288834697.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "grad_norm": 1.402042031288147, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8643555045127869, + "num_tokens": 288873857.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "grad_norm": 1.4486174583435059, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8712897300720215, + "num_tokens": 288910350.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "grad_norm": 
1.625748872756958, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8666329383850098, + "num_tokens": 288945395.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "grad_norm": 1.3252559900283813, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8800615668296814, + "num_tokens": 288989122.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "grad_norm": 1.4877220392227173, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8739587068557739, + "num_tokens": 289027297.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "grad_norm": 1.4590520858764648, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8783739805221558, + "num_tokens": 289063547.0, + "step": 7580 + }, + { + "epoch": 0.9643811219946572, + "grad_norm": 1.488226294517517, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8746567964553833, + "num_tokens": 289099392.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "grad_norm": 1.6372381448745728, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8684581518173218, + "num_tokens": 289131990.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "grad_norm": 1.6115822792053223, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8678869009017944, + "num_tokens": 289175614.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "grad_norm": 1.5172399282455444, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8724184632301331, + "num_tokens": 289216454.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "grad_norm": 1.514888048171997, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8786119222640991, + "num_tokens": 289248990.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "grad_norm": 1.4903181791305542, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8696625232696533, + "num_tokens": 
289287104.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "grad_norm": 1.44446861743927, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8630068302154541, + "num_tokens": 289328306.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "grad_norm": 1.5132131576538086, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.870023250579834, + "num_tokens": 289364396.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "grad_norm": 1.4978880882263184, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8626004457473755, + "num_tokens": 289402040.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "grad_norm": 1.6299911737442017, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8644607067108154, + "num_tokens": 289435346.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "grad_norm": 1.6859495639801025, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8617620468139648, + "num_tokens": 289474848.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "grad_norm": 1.404138207435608, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8725250363349915, + "num_tokens": 289516052.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "grad_norm": 1.4842078685760498, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8635581731796265, + "num_tokens": 289554794.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "grad_norm": 1.427567720413208, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8665592670440674, + "num_tokens": 289597733.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "grad_norm": 1.4581003189086914, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8641457557678223, + "num_tokens": 289639604.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "grad_norm": 1.5347390174865723, + "learning_rate": 1e-06, + 
"loss": 0.3509, + "mean_token_accuracy": 0.8748887181282043, + "num_tokens": 289674358.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "grad_norm": 1.3445394039154053, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8883322477340698, + "num_tokens": 289714577.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "grad_norm": 1.5152277946472168, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8699204921722412, + "num_tokens": 289751662.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "grad_norm": 1.375473141670227, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8828766345977783, + "num_tokens": 289791775.0, + "step": 7599 + }, + { + "epoch": 0.9667981172878769, + "grad_norm": 1.4608641862869263, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8591460585594177, + "num_tokens": 289831764.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "grad_norm": 1.3847720623016357, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8724281787872314, + "num_tokens": 289872920.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "grad_norm": 1.4283126592636108, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8582612872123718, + "num_tokens": 289913932.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "grad_norm": 1.3228298425674438, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8711543083190918, + "num_tokens": 289956819.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "grad_norm": 1.4837958812713623, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8605276346206665, + "num_tokens": 289995274.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "grad_norm": 1.4705759286880493, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8573992848396301, + "num_tokens": 290036469.0, + "step": 7605 + }, + { + "epoch": 
0.96756137895942, + "grad_norm": 1.4990876913070679, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8513806462287903, + "num_tokens": 290078696.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "grad_norm": 1.7922415733337402, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8564587235450745, + "num_tokens": 290117085.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "grad_norm": 1.3241547346115112, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8663710355758667, + "num_tokens": 290162075.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "grad_norm": 1.5187346935272217, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8671116232872009, + "num_tokens": 290196855.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "grad_norm": 1.361680030822754, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8631969690322876, + "num_tokens": 290241875.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "grad_norm": 1.3974766731262207, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8858641982078552, + "num_tokens": 290279815.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "grad_norm": 1.3851765394210815, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8677800297737122, + "num_tokens": 290321357.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "grad_norm": 1.4660816192626953, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8616428375244141, + "num_tokens": 290361281.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "grad_norm": 1.4808834791183472, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8651725053787231, + "num_tokens": 290401654.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "grad_norm": 1.5551323890686035, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 
0.8628525733947754, + "num_tokens": 290436272.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "grad_norm": 1.8560458421707153, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8688779473304749, + "num_tokens": 290476627.0, + "step": 7616 + }, + { + "epoch": 0.9689606920239155, + "grad_norm": 1.4867757558822632, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8604031801223755, + "num_tokens": 290519591.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "grad_norm": 1.4997265338897705, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8602216839790344, + "num_tokens": 290559087.0, + "step": 7618 + }, + { + "epoch": 0.9692151125810966, + "grad_norm": 1.4881868362426758, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8578286170959473, + "num_tokens": 290597033.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "grad_norm": 1.516522765159607, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8544425964355469, + "num_tokens": 290636147.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "grad_norm": 1.4295871257781982, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8731018304824829, + "num_tokens": 290678056.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "grad_norm": 1.7404865026474, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8705323338508606, + "num_tokens": 290708241.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "grad_norm": 1.4838258028030396, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8796638250350952, + "num_tokens": 290744287.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "grad_norm": 1.4601575136184692, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8691807985305786, + "num_tokens": 290783047.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "grad_norm": 
1.565587043762207, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8852695822715759, + "num_tokens": 290823598.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "grad_norm": 1.5404757261276245, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.875554621219635, + "num_tokens": 290859823.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "grad_norm": 1.4481418132781982, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8527103662490845, + "num_tokens": 290901595.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "grad_norm": 1.5104305744171143, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8541123867034912, + "num_tokens": 290940575.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "grad_norm": 1.4677611589431763, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8780856132507324, + "num_tokens": 290976818.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "grad_norm": 1.4443272352218628, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8766941428184509, + "num_tokens": 291014296.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "grad_norm": 1.4125460386276245, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8761003017425537, + "num_tokens": 291051449.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "grad_norm": 1.557455062866211, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8834478855133057, + "num_tokens": 291086803.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "grad_norm": 1.518786072731018, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8560165166854858, + "num_tokens": 291125831.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "grad_norm": 1.6642181873321533, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8499215841293335, + "num_tokens": 
291158692.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "grad_norm": 1.3662333488464355, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8654359579086304, + "num_tokens": 291199276.0, + "step": 7635 + }, + { + "epoch": 0.9713776873171353, + "grad_norm": 1.5768811702728271, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8593098521232605, + "num_tokens": 291233854.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "grad_norm": 1.6539115905761719, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8587111234664917, + "num_tokens": 291267666.0, + "step": 7637 + }, + { + "epoch": 0.9716321078743162, + "grad_norm": 1.4487673044204712, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8854694366455078, + "num_tokens": 291306697.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "grad_norm": 1.5596503019332886, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8706097602844238, + "num_tokens": 291339974.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "grad_norm": 1.7388341426849365, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.85588139295578, + "num_tokens": 291373075.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "grad_norm": 1.4162366390228271, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8692679405212402, + "num_tokens": 291413467.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "grad_norm": 1.5170574188232422, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8705005049705505, + "num_tokens": 291448722.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "grad_norm": 1.410388469696045, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8758211731910706, + "num_tokens": 291486715.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "grad_norm": 1.5785287618637085, + "learning_rate": 1e-06, + 
"loss": 0.3714, + "mean_token_accuracy": 0.8728401064872742, + "num_tokens": 291522351.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "grad_norm": 1.6929682493209839, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8675141930580139, + "num_tokens": 291558622.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "grad_norm": 1.4538624286651611, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8717210292816162, + "num_tokens": 291598355.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "grad_norm": 1.4955332279205322, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8656478524208069, + "num_tokens": 291635573.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "grad_norm": 1.4669902324676514, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8680330514907837, + "num_tokens": 291672838.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "grad_norm": 1.427727222442627, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8683827519416809, + "num_tokens": 291710884.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "grad_norm": 1.6476413011550903, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.874021053314209, + "num_tokens": 291748498.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "grad_norm": 1.5133519172668457, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8739883303642273, + "num_tokens": 291785618.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "grad_norm": 1.4276680946350098, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8712141513824463, + "num_tokens": 291826704.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "grad_norm": 1.4942741394042969, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8654075860977173, + "num_tokens": 291862293.0, + "step": 7653 + }, + { + "epoch": 
0.9736674723317644, + "grad_norm": 1.4630262851715088, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8619570732116699, + "num_tokens": 291899579.0, + "step": 7654 + }, + { + "epoch": 0.973794682610355, + "grad_norm": 1.5304101705551147, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8791184425354004, + "num_tokens": 291933801.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "grad_norm": 1.6235582828521729, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8681607246398926, + "num_tokens": 291973427.0, + "step": 7656 + }, + { + "epoch": 0.9740491031675359, + "grad_norm": 1.619317650794983, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.878087043762207, + "num_tokens": 292006856.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "grad_norm": 1.6387499570846558, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8647371530532837, + "num_tokens": 292045243.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "grad_norm": 1.3955214023590088, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.875119686126709, + "num_tokens": 292089962.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "grad_norm": 1.3976123332977295, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8671849966049194, + "num_tokens": 292131722.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "grad_norm": 1.4161840677261353, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8539712429046631, + "num_tokens": 292176648.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "grad_norm": 1.563705325126648, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8668079376220703, + "num_tokens": 292216843.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "grad_norm": 1.6477495431900024, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 
0.863834023475647, + "num_tokens": 292255830.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "grad_norm": 1.4872591495513916, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8703653812408447, + "num_tokens": 292296221.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "grad_norm": 1.5576659440994263, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8476738929748535, + "num_tokens": 292339307.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "grad_norm": 1.2292540073394775, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8929104804992676, + "num_tokens": 292384390.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "grad_norm": 1.5236883163452148, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8871380686759949, + "num_tokens": 292419251.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "grad_norm": 1.5131683349609375, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.877261757850647, + "num_tokens": 292452003.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "grad_norm": 1.6658475399017334, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8735198974609375, + "num_tokens": 292492903.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "grad_norm": 1.533337950706482, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8821749091148376, + "num_tokens": 292535589.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "grad_norm": 1.8072727918624878, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8499186038970947, + "num_tokens": 292571152.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "grad_norm": 1.5813099145889282, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8798022270202637, + "num_tokens": 292606944.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "grad_norm": 
1.468506097793579, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8529922962188721, + "num_tokens": 292647370.0, + "step": 7673 + }, + { + "epoch": 0.9762116779035747, + "grad_norm": 1.3545105457305908, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8825168609619141, + "num_tokens": 292685336.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "grad_norm": 1.504502296447754, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8639304041862488, + "num_tokens": 292726974.0, + "step": 7675 + }, + { + "epoch": 0.9764660984607556, + "grad_norm": 1.6485214233398438, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8671479225158691, + "num_tokens": 292762586.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "grad_norm": 1.5864611864089966, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8699558973312378, + "num_tokens": 292796050.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "grad_norm": 1.549793004989624, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8841136693954468, + "num_tokens": 292830126.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "grad_norm": 1.5270743370056152, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8727600574493408, + "num_tokens": 292867650.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "grad_norm": 1.422734260559082, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8691096305847168, + "num_tokens": 292906445.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "grad_norm": 1.4184592962265015, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8724980354309082, + "num_tokens": 292945749.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "grad_norm": 1.4306371212005615, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8857327699661255, + "num_tokens": 
292983407.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "grad_norm": 1.4194042682647705, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8723948001861572, + "num_tokens": 293023362.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "grad_norm": 1.6542397737503052, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8727859258651733, + "num_tokens": 293056253.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "grad_norm": 1.4535530805587769, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8693245053291321, + "num_tokens": 293097394.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "grad_norm": 1.5082656145095825, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8588246703147888, + "num_tokens": 293136875.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "grad_norm": 1.4313076734542847, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8513506650924683, + "num_tokens": 293180326.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "grad_norm": 1.5980764627456665, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8570981621742249, + "num_tokens": 293218938.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "grad_norm": 1.5087711811065674, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8698387742042542, + "num_tokens": 293257397.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "grad_norm": 1.5305908918380737, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8553022146224976, + "num_tokens": 293293541.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "grad_norm": 1.4448634386062622, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.870021402835846, + "num_tokens": 293334650.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "grad_norm": 1.501186490058899, + "learning_rate": 1e-06, + 
"loss": 0.3811, + "mean_token_accuracy": 0.8653308153152466, + "num_tokens": 293372217.0, + "step": 7692 + }, + { + "epoch": 0.9786286731967943, + "grad_norm": 1.7007111310958862, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.848388135433197, + "num_tokens": 293407452.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "grad_norm": 1.395101547241211, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8607503771781921, + "num_tokens": 293451751.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "grad_norm": 1.510852336883545, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8533892631530762, + "num_tokens": 293497223.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "grad_norm": 1.4870685338974, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8724920153617859, + "num_tokens": 293531448.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "grad_norm": 1.4571195840835571, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8814723491668701, + "num_tokens": 293568216.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "grad_norm": 1.373740315437317, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.864656925201416, + "num_tokens": 293612578.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "grad_norm": 1.408947467803955, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8664181232452393, + "num_tokens": 293653839.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "grad_norm": 1.7005077600479126, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8769117593765259, + "num_tokens": 293680651.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "grad_norm": 1.700459599494934, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8641345500946045, + "num_tokens": 293713823.0, + "step": 7701 + }, + { + "epoch": 
0.9797735657041089, + "grad_norm": 1.4661073684692383, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8669281005859375, + "num_tokens": 293752696.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "grad_norm": 1.5642890930175781, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8593817949295044, + "num_tokens": 293790984.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "grad_norm": 1.4844613075256348, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8770076036453247, + "num_tokens": 293827770.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "grad_norm": 1.5481427907943726, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8583053350448608, + "num_tokens": 293867230.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "grad_norm": 1.6672667264938354, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8717290163040161, + "num_tokens": 293902990.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "grad_norm": 1.4990462064743042, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8734164834022522, + "num_tokens": 293941826.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "grad_norm": 1.587296962738037, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.854934811592102, + "num_tokens": 293980084.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "grad_norm": 1.37851083278656, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8730815052986145, + "num_tokens": 294021577.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "grad_norm": 1.4737643003463745, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8796771764755249, + "num_tokens": 294058968.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "grad_norm": 1.3823262453079224, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 
0.8733634948730469, + "num_tokens": 294100819.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "grad_norm": 1.3807710409164429, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8807474970817566, + "num_tokens": 294137805.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "grad_norm": 1.6067345142364502, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8789102435112, + "num_tokens": 294169732.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "grad_norm": 1.2895194292068481, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8717402219772339, + "num_tokens": 294216011.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "grad_norm": 1.381551742553711, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8692420721054077, + "num_tokens": 294253992.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "grad_norm": 1.5136576890945435, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8747249841690063, + "num_tokens": 294288739.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "grad_norm": 1.3694543838500977, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.868645191192627, + "num_tokens": 294331620.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "grad_norm": 1.3871020078659058, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8521728515625, + "num_tokens": 294378045.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "grad_norm": 1.3870084285736084, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8848050832748413, + "num_tokens": 294413934.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "grad_norm": 1.428272008895874, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8759252429008484, + "num_tokens": 294451563.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "grad_norm": 1.4372971057891846, + 
"learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.863053560256958, + "num_tokens": 294493009.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "grad_norm": 1.7895114421844482, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8654740452766418, + "num_tokens": 294522064.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "grad_norm": 1.3691049814224243, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.857603907585144, + "num_tokens": 294568824.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "grad_norm": 1.4256190061569214, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8740047216415405, + "num_tokens": 294612945.0, + "step": 7724 + }, + { + "epoch": 0.9826994021116906, + "grad_norm": 1.4449872970581055, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8495201468467712, + "num_tokens": 294655192.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "grad_norm": 1.531900405883789, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8715870976448059, + "num_tokens": 294692789.0, + "step": 7726 + }, + { + "epoch": 0.9829538226688717, + "grad_norm": 1.3896269798278809, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8857128620147705, + "num_tokens": 294729903.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "grad_norm": 1.500905990600586, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8767572045326233, + "num_tokens": 294770347.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "grad_norm": 1.5290145874023438, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8516573905944824, + "num_tokens": 294810547.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "grad_norm": 1.5165741443634033, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.85672926902771, + "num_tokens": 294848852.0, + "step": 7730 + }, 
+ { + "epoch": 0.9834626637832337, + "grad_norm": 1.6094526052474976, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8638301491737366, + "num_tokens": 294883301.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "grad_norm": 1.525969386100769, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8747918605804443, + "num_tokens": 294917349.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "grad_norm": 1.4159194231033325, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8848782777786255, + "num_tokens": 294953586.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "grad_norm": 1.519371509552002, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8570318222045898, + "num_tokens": 294992922.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "grad_norm": 1.593280553817749, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8761413097381592, + "num_tokens": 295024091.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "grad_norm": 1.3813061714172363, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8720195293426514, + "num_tokens": 295070288.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "grad_norm": 1.4099812507629395, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8535610437393188, + "num_tokens": 295111750.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "grad_norm": 1.4882670640945435, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8642885684967041, + "num_tokens": 295145118.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "grad_norm": 1.4547619819641113, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8771423101425171, + "num_tokens": 295183514.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "grad_norm": 1.4988235235214233, + "learning_rate": 1e-06, + "loss": 0.3612, + 
"mean_token_accuracy": 0.8756539225578308, + "num_tokens": 295224701.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "grad_norm": 1.5062824487686157, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8727836608886719, + "num_tokens": 295262906.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "grad_norm": 1.557982087135315, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8685412406921387, + "num_tokens": 295296707.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "grad_norm": 1.586743950843811, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8662070631980896, + "num_tokens": 295329891.0, + "step": 7743 + }, + { + "epoch": 0.9851163974049103, + "grad_norm": 1.4566174745559692, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8707348704338074, + "num_tokens": 295367093.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "grad_norm": 1.5200592279434204, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8747414946556091, + "num_tokens": 295406592.0, + "step": 7745 + }, + { + "epoch": 0.9853708179620914, + "grad_norm": 1.4709408283233643, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8709397315979004, + "num_tokens": 295442796.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "grad_norm": 1.5843560695648193, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8614640235900879, + "num_tokens": 295480960.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "grad_norm": 1.4930739402770996, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8750268816947937, + "num_tokens": 295518164.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "grad_norm": 1.460401177406311, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8823848962783813, + "num_tokens": 295553432.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + 
"grad_norm": 1.5329132080078125, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8726382255554199, + "num_tokens": 295587331.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "grad_norm": 1.3435813188552856, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8669961094856262, + "num_tokens": 295630146.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "grad_norm": 1.428236484527588, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8715100288391113, + "num_tokens": 295669992.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "grad_norm": 1.4170417785644531, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8758118748664856, + "num_tokens": 295712427.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "grad_norm": 1.8150326013565063, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8723922967910767, + "num_tokens": 295744311.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "grad_norm": 1.553191900253296, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8858895897865295, + "num_tokens": 295777877.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "grad_norm": 1.5391952991485596, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8785957098007202, + "num_tokens": 295812523.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "grad_norm": 1.3976325988769531, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8642017841339111, + "num_tokens": 295854880.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "grad_norm": 1.4382452964782715, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8646445274353027, + "num_tokens": 295900793.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "grad_norm": 1.4854909181594849, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8443435430526733, + 
"num_tokens": 295942799.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "grad_norm": 1.513595700263977, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.86785888671875, + "num_tokens": 295984360.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "grad_norm": 1.3463612794876099, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8595949411392212, + "num_tokens": 296032187.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "grad_norm": 1.549419641494751, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8596265316009521, + "num_tokens": 296067811.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + "grad_norm": 1.5381883382797241, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8506062030792236, + "num_tokens": 296103905.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "grad_norm": 1.516796588897705, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8630127310752869, + "num_tokens": 296140823.0, + "step": 7764 + }, + { + "epoch": 0.9877878132553111, + "grad_norm": 1.367207407951355, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8777939081192017, + "num_tokens": 296181644.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "grad_norm": 1.4093410968780518, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8793516159057617, + "num_tokens": 296219523.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "grad_norm": 1.491333246231079, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.873192310333252, + "num_tokens": 296257934.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "grad_norm": 1.5295814275741577, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.846644937992096, + "num_tokens": 296297539.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "grad_norm": 1.2486088275909424, + "learning_rate": 
1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8781375885009766, + "num_tokens": 296346047.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "grad_norm": 1.3963935375213623, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8779550790786743, + "num_tokens": 296385852.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "grad_norm": 1.414193034172058, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8787320852279663, + "num_tokens": 296423968.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "grad_norm": 1.4920272827148438, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8664724826812744, + "num_tokens": 296464110.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "grad_norm": 1.3962191343307495, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.873262882232666, + "num_tokens": 296502523.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "grad_norm": 1.455271601676941, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8778315782546997, + "num_tokens": 296541080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "grad_norm": 1.439460277557373, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8614920377731323, + "num_tokens": 296583713.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "grad_norm": 1.605926513671875, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8543710708618164, + "num_tokens": 296620487.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "grad_norm": 1.3864837884902954, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8715016841888428, + "num_tokens": 296661154.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "grad_norm": 1.518739938735962, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8784046173095703, + "num_tokens": 296693926.0, + "step": 7778 + }, + { + "epoch": 
0.9895687571555781, + "grad_norm": 1.5184779167175293, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8913975954055786, + "num_tokens": 296725504.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "grad_norm": 1.4686784744262695, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8842661380767822, + "num_tokens": 296760698.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "grad_norm": 1.5050476789474487, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8661366105079651, + "num_tokens": 296802076.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "grad_norm": 1.5726932287216187, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8727037310600281, + "num_tokens": 296839012.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "grad_norm": 1.5603972673416138, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8749659061431885, + "num_tokens": 296871114.0, + "step": 7783 + }, + { + "epoch": 0.9902048085485308, + "grad_norm": 1.4902901649475098, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8821756839752197, + "num_tokens": 296904247.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "grad_norm": 1.4264494180679321, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8633960485458374, + "num_tokens": 296948181.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "grad_norm": 1.491960883140564, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8562430739402771, + "num_tokens": 296986976.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "grad_norm": 1.443440318107605, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8697112798690796, + "num_tokens": 297026547.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "grad_norm": 1.4166839122772217, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 
0.8776911497116089, + "num_tokens": 297067268.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "grad_norm": 1.4458423852920532, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.86495041847229, + "num_tokens": 297104920.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "grad_norm": 1.4725322723388672, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.871392548084259, + "num_tokens": 297140936.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "grad_norm": 1.3492709398269653, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8792223930358887, + "num_tokens": 297182754.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "grad_norm": 1.5107362270355225, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8618578910827637, + "num_tokens": 297226243.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "grad_norm": 1.5955570936203003, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8711809515953064, + "num_tokens": 297262018.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "grad_norm": 1.4869115352630615, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8792931437492371, + "num_tokens": 297297958.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "grad_norm": 1.560214877128601, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8618208169937134, + "num_tokens": 297331892.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "grad_norm": 1.4282327890396118, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8679969310760498, + "num_tokens": 297372970.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "grad_norm": 1.3739922046661377, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8809075355529785, + "num_tokens": 297410298.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "grad_norm": 
1.3709086179733276, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8553755283355713, + "num_tokens": 297454830.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "grad_norm": 1.5310063362121582, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8679519891738892, + "num_tokens": 297489309.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "grad_norm": 1.4561411142349243, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8554502725601196, + "num_tokens": 297532940.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "grad_norm": 1.5808167457580566, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8501796126365662, + "num_tokens": 297574956.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "grad_norm": 1.5016978979110718, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8688791990280151, + "num_tokens": 297613744.0, + "step": 7802 + }, + { + "epoch": 0.9926218038417504, + "grad_norm": 1.5245059728622437, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8548576831817627, + "num_tokens": 297654142.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "grad_norm": 1.5092045068740845, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8795726299285889, + "num_tokens": 297688784.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "grad_norm": 1.4326223134994507, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8675121068954468, + "num_tokens": 297731139.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "grad_norm": 1.4218266010284424, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8767253756523132, + "num_tokens": 297774203.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "grad_norm": 1.327967882156372, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8757347464561462, + "num_tokens": 
297817340.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "grad_norm": 1.4642000198364258, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8661478757858276, + "num_tokens": 297854890.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "grad_norm": 1.5443871021270752, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8579879403114319, + "num_tokens": 297893477.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "grad_norm": 1.541661262512207, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8717350363731384, + "num_tokens": 297927014.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "grad_norm": 1.5813785791397095, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8586342930793762, + "num_tokens": 297962776.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "grad_norm": 1.5522888898849487, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8708578944206238, + "num_tokens": 298000279.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "grad_norm": 1.455369234085083, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8713111877441406, + "num_tokens": 298038811.0, + "step": 7813 + }, + { + "epoch": 0.9940211169062461, + "grad_norm": 1.441435694694519, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.880936861038208, + "num_tokens": 298074500.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "grad_norm": 1.5433069467544556, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8505284786224365, + "num_tokens": 298114156.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "grad_norm": 1.409489393234253, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8662973642349243, + "num_tokens": 298156660.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "grad_norm": 1.529221534729004, + "learning_rate": 1e-06, + "loss": 
0.4495, + "mean_token_accuracy": 0.8487482070922852, + "num_tokens": 298195373.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "grad_norm": 1.445786476135254, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8637962341308594, + "num_tokens": 298233917.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "grad_norm": 1.5530941486358643, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8568334579467773, + "num_tokens": 298268493.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "grad_norm": 1.3580495119094849, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8696674704551697, + "num_tokens": 298310789.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "grad_norm": 1.3786066770553589, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8608731031417847, + "num_tokens": 298354253.0, + "step": 7821 + }, + { + "epoch": 0.9950387991349701, + "grad_norm": 1.4138250350952148, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8707594871520996, + "num_tokens": 298395490.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "grad_norm": 1.3910356760025024, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8762943148612976, + "num_tokens": 298436429.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "grad_norm": 1.4248578548431396, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.864109992980957, + "num_tokens": 298478209.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "grad_norm": 1.3682620525360107, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8665541410446167, + "num_tokens": 298520816.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "grad_norm": 1.4548308849334717, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8562227487564087, + "num_tokens": 298562314.0, + "step": 7826 + }, + { + "epoch": 
0.9956748505279227, + "grad_norm": 1.4040167331695557, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8584691286087036, + "num_tokens": 298604854.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "grad_norm": 1.4767667055130005, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.865235447883606, + "num_tokens": 298641545.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "grad_norm": 1.4302912950515747, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.85528564453125, + "num_tokens": 298682394.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "grad_norm": 1.5040491819381714, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.859296977519989, + "num_tokens": 298719968.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "grad_norm": 1.4775351285934448, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8707439303398132, + "num_tokens": 298756114.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "grad_norm": 1.5314323902130127, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8466461896896362, + "num_tokens": 298792093.0, + "step": 7832 + }, + { + "epoch": 0.9964381121994658, + "grad_norm": 1.4431275129318237, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8670861721038818, + "num_tokens": 298831948.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "grad_norm": 1.5047426223754883, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8722972869873047, + "num_tokens": 298868611.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "grad_norm": 1.6898396015167236, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8633193969726562, + "num_tokens": 298898982.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "grad_norm": 1.3687671422958374, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 
0.8750717639923096, + "num_tokens": 298940845.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "grad_norm": 1.614525318145752, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8702936768531799, + "num_tokens": 298973221.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "grad_norm": 1.6273653507232666, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8604258298873901, + "num_tokens": 299006994.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "grad_norm": 1.4136241674423218, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8809376955032349, + "num_tokens": 299046473.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "grad_norm": 1.434241533279419, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.878603994846344, + "num_tokens": 299086613.0, + "step": 7840 + }, + { + "epoch": 0.9974557944281898, + "grad_norm": 1.4089769124984741, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8714463710784912, + "num_tokens": 299126525.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "grad_norm": 1.4757622480392456, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.859445333480835, + "num_tokens": 299169346.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "grad_norm": 1.551023006439209, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8515117168426514, + "num_tokens": 299204638.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "grad_norm": 1.379693865776062, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8824375867843628, + "num_tokens": 299244148.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "grad_norm": 1.4022555351257324, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8673619627952576, + "num_tokens": 299284732.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "grad_norm": 
1.42646062374115, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8597549200057983, + "num_tokens": 299326159.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "grad_norm": 1.4466458559036255, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8731642365455627, + "num_tokens": 299364305.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "grad_norm": 1.5668847560882568, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.86457359790802, + "num_tokens": 299398701.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "grad_norm": 1.5031510591506958, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8639260530471802, + "num_tokens": 299438370.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "grad_norm": 1.4010915756225586, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8784668445587158, + "num_tokens": 299477728.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "grad_norm": 1.6322740316390991, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8612604141235352, + "num_tokens": 299510495.0, + "step": 7851 + }, + { + "epoch": 0.9988551074926854, + "grad_norm": 1.4541354179382324, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8786892890930176, + "num_tokens": 299546954.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "grad_norm": 1.404349446296692, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8776335716247559, + "num_tokens": 299588597.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "grad_norm": 1.6414769887924194, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8692774176597595, + "num_tokens": 299622206.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "grad_norm": 1.4738359451293945, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8704960942268372, + "num_tokens": 
299659660.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "grad_norm": 1.3377799987792969, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8826562166213989, + "num_tokens": 299699931.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "grad_norm": 1.535750150680542, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8772526383399963, + "num_tokens": 299734145.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "grad_norm": 1.4967947006225586, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8535463809967041, + "num_tokens": 299773673.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "grad_norm": 1.532828450202942, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8668411374092102, + "num_tokens": 299810272.0, + "step": 7859 + }, + { + "epoch": 0.9998727897214095, + "grad_norm": 1.5289874076843262, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.857785701751709, + "num_tokens": 299848639.0, + "step": 7860 + }, + { + "epoch": 1.0, + "grad_norm": 1.5053308010101318, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8719073534011841, + "num_tokens": 299886327.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "grad_norm": 1.2968192100524902, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8775171637535095, + "num_tokens": 299927606.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "grad_norm": 1.4607365131378174, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8734209537506104, + "num_tokens": 299966521.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "grad_norm": 1.3720906972885132, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8809123635292053, + "num_tokens": 300007426.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "grad_norm": 1.463454246520996, + "learning_rate": 1e-06, + "loss": 0.3584, + 
"mean_token_accuracy": 0.8748360276222229, + "num_tokens": 300043668.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "grad_norm": 1.4354243278503418, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8725414872169495, + "num_tokens": 300081477.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "grad_norm": 1.3992255926132202, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8817757368087769, + "num_tokens": 300122652.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "grad_norm": 1.42557954788208, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8699647188186646, + "num_tokens": 300165563.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "grad_norm": 1.5314997434616089, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8788227438926697, + "num_tokens": 300202282.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "grad_norm": 1.4886828660964966, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8775930404663086, + "num_tokens": 300245267.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "grad_norm": 1.4765273332595825, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8739020228385925, + "num_tokens": 300286912.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "grad_norm": 1.5229086875915527, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.879496693611145, + "num_tokens": 300325061.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "grad_norm": 1.4736716747283936, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8666084408760071, + "num_tokens": 300363667.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "grad_norm": 1.5068727731704712, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8672997355461121, + "num_tokens": 300404925.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + 
"grad_norm": 1.4424774646759033, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8799276351928711, + "num_tokens": 300446620.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "grad_norm": 1.5557291507720947, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8750923871994019, + "num_tokens": 300484284.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "grad_norm": 1.581343173980713, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8648766279220581, + "num_tokens": 300523082.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "grad_norm": 1.6283950805664062, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8782079219818115, + "num_tokens": 300555733.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "grad_norm": 1.6468466520309448, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.882230818271637, + "num_tokens": 300585379.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "grad_norm": 1.5495697259902954, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8717268705368042, + "num_tokens": 300622518.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "grad_norm": 1.616597294807434, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8606989979743958, + "num_tokens": 300657609.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "grad_norm": 1.435399055480957, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8753635287284851, + "num_tokens": 300699532.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "grad_norm": 1.7177057266235352, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.878862738609314, + "num_tokens": 300736820.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "grad_norm": 1.3624999523162842, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8955199718475342, + 
"num_tokens": 300776376.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "grad_norm": 1.6658923625946045, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8610216975212097, + "num_tokens": 300813130.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "grad_norm": 1.427025556564331, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8681780099868774, + "num_tokens": 300853065.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "grad_norm": 1.4373975992202759, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8776798248291016, + "num_tokens": 300894912.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "grad_norm": 1.5016714334487915, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8740493655204773, + "num_tokens": 300935457.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "grad_norm": 1.4828649759292603, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8880788087844849, + "num_tokens": 300976177.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "grad_norm": 1.4809235334396362, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8656808137893677, + "num_tokens": 301016343.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "grad_norm": 1.5481562614440918, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8735300898551941, + "num_tokens": 301055492.0, + "step": 7891 + }, + { + "epoch": 1.0039435186363057, + "grad_norm": 1.479792594909668, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.88230299949646, + "num_tokens": 301093139.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "grad_norm": 1.5607446432113647, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8715561628341675, + "num_tokens": 301131596.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "grad_norm": 1.5210810899734497, + 
"learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8686795234680176, + "num_tokens": 301167108.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "grad_norm": 1.4601500034332275, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8773539066314697, + "num_tokens": 301208654.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "grad_norm": 1.5258361101150513, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.875260591506958, + "num_tokens": 301246677.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "grad_norm": 1.5905539989471436, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8721683621406555, + "num_tokens": 301282225.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "grad_norm": 1.408057451248169, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8751416206359863, + "num_tokens": 301322636.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "grad_norm": 1.421999454498291, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8684271574020386, + "num_tokens": 301367528.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "grad_norm": 1.433870792388916, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8760766386985779, + "num_tokens": 301409298.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "grad_norm": 1.5552879571914673, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8654172420501709, + "num_tokens": 301442848.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "grad_norm": 1.4011383056640625, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8820862770080566, + "num_tokens": 301482814.0, + "step": 7902 + }, + { + "epoch": 1.0053428317008015, + "grad_norm": 1.4638361930847168, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8875898718833923, + "num_tokens": 301521590.0, + "step": 7903 + }, 
+ { + "epoch": 1.0054700419793918, + "grad_norm": 1.4292290210723877, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8876733779907227, + "num_tokens": 301560710.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "grad_norm": 1.624416708946228, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8657959699630737, + "num_tokens": 301595872.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "grad_norm": 1.5209134817123413, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8805603981018066, + "num_tokens": 301631654.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "grad_norm": 1.4995049238204956, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.895361065864563, + "num_tokens": 301667537.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "grad_norm": 1.5202146768569946, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8621624708175659, + "num_tokens": 301709196.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "grad_norm": 1.4532651901245117, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8832564949989319, + "num_tokens": 301746165.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "grad_norm": 1.6216799020767212, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8716909885406494, + "num_tokens": 301781287.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "grad_norm": 1.5710070133209229, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8815634846687317, + "num_tokens": 301818208.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "grad_norm": 1.4834744930267334, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8646314740180969, + "num_tokens": 301859021.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "grad_norm": 1.5297300815582275, + "learning_rate": 1e-06, + "loss": 0.3129, + 
"mean_token_accuracy": 0.8895989656448364, + "num_tokens": 301896265.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "grad_norm": 1.4786465167999268, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.877132773399353, + "num_tokens": 301937092.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "grad_norm": 1.4629371166229248, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8762245178222656, + "num_tokens": 301974595.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "grad_norm": 1.4776039123535156, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8718482255935669, + "num_tokens": 302017216.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "grad_norm": 1.5547243356704712, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8936114311218262, + "num_tokens": 302050811.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "grad_norm": 1.519921898841858, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8781371116638184, + "num_tokens": 302091764.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "grad_norm": 1.4923226833343506, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8725480437278748, + "num_tokens": 302131881.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "grad_norm": 1.549397349357605, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8621697425842285, + "num_tokens": 302168199.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "grad_norm": 1.4993551969528198, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8808548450469971, + "num_tokens": 302206595.0, + "step": 7921 + }, + { + "epoch": 1.0077598269940211, + "grad_norm": 1.5038615465164185, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8782718777656555, + "num_tokens": 302241138.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + 
"grad_norm": 1.4599984884262085, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8829259872436523, + "num_tokens": 302282054.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "grad_norm": 1.622754454612732, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.865217387676239, + "num_tokens": 302321608.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "grad_norm": 1.4703093767166138, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8773249983787537, + "num_tokens": 302359638.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "grad_norm": 1.6308742761611938, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8747931718826294, + "num_tokens": 302392131.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "grad_norm": 1.4457272291183472, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8782107830047607, + "num_tokens": 302432145.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "grad_norm": 1.5369303226470947, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8655789494514465, + "num_tokens": 302470029.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "grad_norm": 1.3640737533569336, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8895848989486694, + "num_tokens": 302512579.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "grad_norm": 1.4689394235610962, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8845929503440857, + "num_tokens": 302549549.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "grad_norm": 1.5346808433532715, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.883368194103241, + "num_tokens": 302584073.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "grad_norm": 1.4356410503387451, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8774522542953491, + 
"num_tokens": 302625846.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "grad_norm": 1.483662724494934, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8721594214439392, + "num_tokens": 302666414.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "grad_norm": 1.5660831928253174, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8747916221618652, + "num_tokens": 302701493.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "grad_norm": 1.5473625659942627, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8664277791976929, + "num_tokens": 302737548.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "grad_norm": 1.468260407447815, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8868747353553772, + "num_tokens": 302773855.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "grad_norm": 1.4604041576385498, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8798571228981018, + "num_tokens": 302812704.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "grad_norm": 1.6992658376693726, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8570077419281006, + "num_tokens": 302845767.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "grad_norm": 1.516837239265442, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8750150799751282, + "num_tokens": 302884689.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "grad_norm": 1.4099290370941162, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8672130107879639, + "num_tokens": 302926921.0, + "step": 7940 + }, + { + "epoch": 1.0101768222872407, + "grad_norm": 1.5140769481658936, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8813557624816895, + "num_tokens": 302963665.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "grad_norm": 1.5015391111373901, + 
"learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8673336505889893, + "num_tokens": 303005132.0, + "step": 7942 + }, + { + "epoch": 1.0104312428444218, + "grad_norm": 1.4294236898422241, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8809908628463745, + "num_tokens": 303046753.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "grad_norm": 1.581008791923523, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8913804292678833, + "num_tokens": 303079260.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "grad_norm": 1.4775961637496948, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8751107454299927, + "num_tokens": 303118187.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "grad_norm": 1.532211422920227, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8696709871292114, + "num_tokens": 303155358.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "grad_norm": 1.4635120630264282, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.863663911819458, + "num_tokens": 303198412.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "grad_norm": 1.670495629310608, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8584470748901367, + "num_tokens": 303234773.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "grad_norm": 1.6496033668518066, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8718281984329224, + "num_tokens": 303270318.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "grad_norm": 1.7689077854156494, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8750084638595581, + "num_tokens": 303297457.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "grad_norm": 1.5818848609924316, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8523095846176147, + "num_tokens": 303335813.0, + "step": 7951 + 
}, + { + "epoch": 1.0115761353517365, + "grad_norm": 1.5774250030517578, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8568658828735352, + "num_tokens": 303377181.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "grad_norm": 1.674338459968567, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.855583667755127, + "num_tokens": 303410440.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "grad_norm": 1.5457532405853271, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8824589252471924, + "num_tokens": 303443405.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "grad_norm": 1.4510642290115356, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8819807767868042, + "num_tokens": 303485333.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "grad_norm": 1.9008395671844482, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.876018226146698, + "num_tokens": 303513732.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "grad_norm": 1.4757670164108276, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8777321577072144, + "num_tokens": 303550685.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "grad_norm": 1.587172508239746, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8740012645721436, + "num_tokens": 303586516.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "grad_norm": 1.6977249383926392, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8811272382736206, + "num_tokens": 303620261.0, + "step": 7959 + }, + { + "epoch": 1.0125938175804605, + "grad_norm": 1.5709291696548462, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8581123352050781, + "num_tokens": 303658681.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "grad_norm": 1.5694557428359985, + "learning_rate": 1e-06, + "loss": 0.3808, + 
"mean_token_accuracy": 0.8692374229431152, + "num_tokens": 303696236.0, + "step": 7961 + }, + { + "epoch": 1.0128482381376416, + "grad_norm": 1.5045652389526367, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8844860792160034, + "num_tokens": 303733298.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "grad_norm": 1.609734296798706, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8612666726112366, + "num_tokens": 303774256.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "grad_norm": 1.4946743249893188, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8762351274490356, + "num_tokens": 303812614.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "grad_norm": 1.6508121490478516, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8591039180755615, + "num_tokens": 303846733.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "grad_norm": 1.469547986984253, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.884152889251709, + "num_tokens": 303885785.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "grad_norm": 1.4930901527404785, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8684060573577881, + "num_tokens": 303924722.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "grad_norm": 1.519992709159851, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8641586303710938, + "num_tokens": 303966361.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "grad_norm": 1.483333706855774, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8747153282165527, + "num_tokens": 304007912.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "grad_norm": 1.5716603994369507, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8719639778137207, + "num_tokens": 304043882.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + 
"grad_norm": 1.469815731048584, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8852038383483887, + "num_tokens": 304079903.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "grad_norm": 1.4198079109191895, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8718347549438477, + "num_tokens": 304124824.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "grad_norm": 1.831606149673462, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.85695481300354, + "num_tokens": 304157970.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "grad_norm": 1.6377633810043335, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8834750652313232, + "num_tokens": 304192366.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "grad_norm": 1.6143081188201904, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8844317197799683, + "num_tokens": 304226016.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "grad_norm": 1.4337767362594604, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8905776143074036, + "num_tokens": 304263725.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "grad_norm": 1.4901435375213623, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8745937347412109, + "num_tokens": 304302573.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "grad_norm": 1.4545459747314453, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8700743913650513, + "num_tokens": 304345987.0, + "step": 7978 + }, + { + "epoch": 1.0150108128736801, + "grad_norm": 1.5836069583892822, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8810768723487854, + "num_tokens": 304378950.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "grad_norm": 1.3761968612670898, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.888718843460083, + 
"num_tokens": 304418498.0, + "step": 7980 + }, + { + "epoch": 1.0152652334308612, + "grad_norm": 1.39496910572052, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8993586301803589, + "num_tokens": 304455175.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "grad_norm": 1.4605233669281006, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8797218799591064, + "num_tokens": 304492342.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "grad_norm": 1.4360498189926147, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8849352598190308, + "num_tokens": 304531784.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "grad_norm": 1.473415493965149, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8847346305847168, + "num_tokens": 304568159.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "grad_norm": 1.623297929763794, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8788580298423767, + "num_tokens": 304602273.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "grad_norm": 1.6524276733398438, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.868777334690094, + "num_tokens": 304636252.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "grad_norm": 1.8877586126327515, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8918325901031494, + "num_tokens": 304675464.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "grad_norm": 1.595668911933899, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8876293897628784, + "num_tokens": 304708675.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "grad_norm": 1.5074851512908936, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8649052977561951, + "num_tokens": 304745015.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "grad_norm": 1.7109429836273193, + 
"learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8562365770339966, + "num_tokens": 304779763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "grad_norm": 1.4224612712860107, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8957374095916748, + "num_tokens": 304815698.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "grad_norm": 1.4509658813476562, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8845776319503784, + "num_tokens": 304852128.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "grad_norm": 1.475152850151062, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8783196210861206, + "num_tokens": 304893675.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "grad_norm": 1.5109450817108154, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8704858422279358, + "num_tokens": 304933443.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "grad_norm": 1.4290679693222046, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8856902718544006, + "num_tokens": 304977822.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "grad_norm": 1.450157642364502, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8688849210739136, + "num_tokens": 305019296.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "grad_norm": 1.5252100229263306, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8832630515098572, + "num_tokens": 305057645.0, + "step": 7997 + }, + { + "epoch": 1.0174278081669, + "grad_norm": 1.6054459810256958, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8728286623954773, + "num_tokens": 305095814.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "grad_norm": 1.408355712890625, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8641796112060547, + "num_tokens": 305140003.0, + "step": 7999 + }, 
+ { + "epoch": 1.017682228724081, + "grad_norm": 1.3493471145629883, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8802604675292969, + "num_tokens": 305183928.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "grad_norm": 1.4318426847457886, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8709062337875366, + "num_tokens": 305227209.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "grad_norm": 1.6219277381896973, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8804780840873718, + "num_tokens": 305259431.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "grad_norm": 1.8529446125030518, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8672356605529785, + "num_tokens": 305294644.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "grad_norm": 1.603283166885376, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8825134038925171, + "num_tokens": 305327718.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "grad_norm": 1.472460150718689, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8748801350593567, + "num_tokens": 305366958.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "grad_norm": 1.5206024646759033, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8706610202789307, + "num_tokens": 305407546.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "grad_norm": 1.5118883848190308, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8854106068611145, + "num_tokens": 305447525.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "grad_norm": 1.697147250175476, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8776001930236816, + "num_tokens": 305485274.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "grad_norm": 1.5352851152420044, + "learning_rate": 1e-06, + "loss": 0.3584, + 
"mean_token_accuracy": 0.8736502528190613, + "num_tokens": 305524269.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "grad_norm": 1.482484221458435, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8700807094573975, + "num_tokens": 305565565.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "grad_norm": 1.5410172939300537, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8673273324966431, + "num_tokens": 305604156.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "grad_norm": 1.5668470859527588, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8756124973297119, + "num_tokens": 305638951.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "grad_norm": 1.498382329940796, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8790438175201416, + "num_tokens": 305677338.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "grad_norm": 1.6432756185531616, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8854495882987976, + "num_tokens": 305711133.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "grad_norm": 1.4934653043746948, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8678282499313354, + "num_tokens": 305753668.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "grad_norm": 1.5292919874191284, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8748599290847778, + "num_tokens": 305791940.0, + "step": 8016 + }, + { + "epoch": 1.0198448034601195, + "grad_norm": 1.5058907270431519, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8844411373138428, + "num_tokens": 305833639.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "grad_norm": 1.5320236682891846, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8567305207252502, + "num_tokens": 305874996.0, + "step": 8018 + }, + { + "epoch": 1.0200992240173006, + 
"grad_norm": 1.4176527261734009, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8844088315963745, + "num_tokens": 305914395.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "grad_norm": 1.6789451837539673, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8579028844833374, + "num_tokens": 305949996.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "grad_norm": 1.4057917594909668, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8774282932281494, + "num_tokens": 305994074.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "grad_norm": 1.4701893329620361, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8849467039108276, + "num_tokens": 306032077.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "grad_norm": 1.4948878288269043, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8886460065841675, + "num_tokens": 306066746.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "grad_norm": 1.412398099899292, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.882180392742157, + "num_tokens": 306106347.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "grad_norm": 1.5062975883483887, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8763192296028137, + "num_tokens": 306145490.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "grad_norm": 1.5908430814743042, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8680547475814819, + "num_tokens": 306184328.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "grad_norm": 1.35675048828125, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8742300868034363, + "num_tokens": 306229612.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "grad_norm": 1.5256978273391724, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8805558085441589, + 
"num_tokens": 306263945.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "grad_norm": 1.4916484355926514, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8674150705337524, + "num_tokens": 306305241.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "grad_norm": 1.5183625221252441, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8687312602996826, + "num_tokens": 306342784.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "grad_norm": 1.4241100549697876, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8773568868637085, + "num_tokens": 306383269.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "grad_norm": 1.6145107746124268, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8702007532119751, + "num_tokens": 306415754.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "grad_norm": 1.4048678874969482, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8836636543273926, + "num_tokens": 306455483.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "grad_norm": 1.4730308055877686, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8612672686576843, + "num_tokens": 306493703.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "grad_norm": 1.5157952308654785, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8712400794029236, + "num_tokens": 306528110.0, + "step": 8035 + }, + { + "epoch": 1.0222617987533393, + "grad_norm": 1.6253066062927246, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8782959580421448, + "num_tokens": 306559663.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "grad_norm": 1.4476685523986816, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8774175643920898, + "num_tokens": 306598465.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "grad_norm": 1.382270097732544, + 
"learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8794733285903931, + "num_tokens": 306639104.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "grad_norm": 1.4925570487976074, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8731963038444519, + "num_tokens": 306677078.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "grad_norm": 1.623736023902893, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8757055401802063, + "num_tokens": 306711854.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "grad_norm": 1.5312023162841797, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8692095279693604, + "num_tokens": 306750194.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "grad_norm": 1.464164137840271, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8770246505737305, + "num_tokens": 306789758.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "grad_norm": 1.5851696729660034, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8705188632011414, + "num_tokens": 306826976.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "grad_norm": 1.4740312099456787, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8730340003967285, + "num_tokens": 306868558.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "grad_norm": 1.390657901763916, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8751778602600098, + "num_tokens": 306914624.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "grad_norm": 1.4660674333572388, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.871540367603302, + "num_tokens": 306955569.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "grad_norm": 1.474053978919983, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.865077555179596, + "num_tokens": 307001975.0, + "step": 8047 + }, + 
{ + "epoch": 1.0237883220964255, + "grad_norm": 1.6318702697753906, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8682560920715332, + "num_tokens": 307039949.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "grad_norm": 1.5680426359176636, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8712133765220642, + "num_tokens": 307076241.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "grad_norm": 1.4253441095352173, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8805038928985596, + "num_tokens": 307122638.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "grad_norm": 1.4003455638885498, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8720089197158813, + "num_tokens": 307163502.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "grad_norm": 1.6285407543182373, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8672277927398682, + "num_tokens": 307201306.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "grad_norm": 1.518847942352295, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8920981287956238, + "num_tokens": 307233668.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "grad_norm": 1.6533385515213013, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8675210475921631, + "num_tokens": 307270225.0, + "step": 8054 + }, + { + "epoch": 1.024678794046559, + "grad_norm": 1.3924583196640015, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8843867778778076, + "num_tokens": 307312006.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "grad_norm": 1.5533225536346436, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8719732761383057, + "num_tokens": 307351286.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "grad_norm": 1.4693189859390259, + "learning_rate": 1e-06, + "loss": 0.3775, + 
"mean_token_accuracy": 0.866338312625885, + "num_tokens": 307396035.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "grad_norm": 1.6996941566467285, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8734694123268127, + "num_tokens": 307427069.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "grad_norm": 1.5880701541900635, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8722317218780518, + "num_tokens": 307465675.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "grad_norm": 1.5500643253326416, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8770252466201782, + "num_tokens": 307505082.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "grad_norm": 1.4305381774902344, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.893939197063446, + "num_tokens": 307541042.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "grad_norm": 1.5025596618652344, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8720235228538513, + "num_tokens": 307581957.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "grad_norm": 1.6135164499282837, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8586212396621704, + "num_tokens": 307616180.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "grad_norm": 1.4782583713531494, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8648474216461182, + "num_tokens": 307657807.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "grad_norm": 1.619850993156433, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8811139464378357, + "num_tokens": 307689768.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "grad_norm": 1.5823352336883545, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8867321014404297, + "num_tokens": 307722176.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + 
"grad_norm": 1.6782035827636719, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.866578996181488, + "num_tokens": 307758986.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "grad_norm": 1.4781217575073242, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8771926164627075, + "num_tokens": 307796299.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "grad_norm": 1.439314842224121, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8919059038162231, + "num_tokens": 307835016.0, + "step": 8069 + }, + { + "epoch": 1.0265869482254166, + "grad_norm": 1.550376534461975, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8725931644439697, + "num_tokens": 307872071.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "grad_norm": 1.459897756576538, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8735734224319458, + "num_tokens": 307909181.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "grad_norm": 1.5565979480743408, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8681090474128723, + "num_tokens": 307946906.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "grad_norm": 1.5685386657714844, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8854379057884216, + "num_tokens": 307980386.0, + "step": 8073 + }, + { + "epoch": 1.0270957893397787, + "grad_norm": 1.4054157733917236, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8737870454788208, + "num_tokens": 308021690.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "grad_norm": 1.4319905042648315, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8802865743637085, + "num_tokens": 308059723.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "grad_norm": 1.5432989597320557, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8795673847198486, + 
"num_tokens": 308100514.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "grad_norm": 1.5891361236572266, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8694230318069458, + "num_tokens": 308135521.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "grad_norm": 1.499839425086975, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8748114109039307, + "num_tokens": 308174008.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "grad_norm": 1.4835848808288574, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8748660087585449, + "num_tokens": 308214504.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "grad_norm": 1.4773874282836914, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8713582754135132, + "num_tokens": 308255878.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "grad_norm": 1.424390196800232, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8756767511367798, + "num_tokens": 308297889.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "grad_norm": 1.442167043685913, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8704470992088318, + "num_tokens": 308340515.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "grad_norm": 1.574053406715393, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8840382099151611, + "num_tokens": 308378869.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "grad_norm": 1.4798939228057861, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8854049444198608, + "num_tokens": 308417531.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "grad_norm": 1.49903404712677, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8822289705276489, + "num_tokens": 308452559.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "grad_norm": 1.4813871383666992, + "learning_rate": 
1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8761835098266602, + "num_tokens": 308491375.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "grad_norm": 1.608758568763733, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8752743601799011, + "num_tokens": 308526611.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "grad_norm": 1.5098719596862793, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8668497204780579, + "num_tokens": 308567298.0, + "step": 8088 + }, + { + "epoch": 1.0290039435186362, + "grad_norm": 1.6615875959396362, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8719232082366943, + "num_tokens": 308605925.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "grad_norm": 1.4667418003082275, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8712144494056702, + "num_tokens": 308648031.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "grad_norm": 1.4969476461410522, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.887170672416687, + "num_tokens": 308683477.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "grad_norm": 1.3625297546386719, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8782123327255249, + "num_tokens": 308725859.0, + "step": 8092 + }, + { + "epoch": 1.0295127846329983, + "grad_norm": 1.5267308950424194, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8752643465995789, + "num_tokens": 308763242.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "grad_norm": 1.6022140979766846, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8687582015991211, + "num_tokens": 308800687.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "grad_norm": 1.512239694595337, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.876388430595398, + "num_tokens": 308842883.0, + "step": 8095 + }, + { + 
"epoch": 1.02989441546877, + "grad_norm": 1.5795519351959229, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.873478889465332, + "num_tokens": 308878216.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "grad_norm": 1.514757513999939, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8754558563232422, + "num_tokens": 308918904.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "grad_norm": 1.5682854652404785, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8800346255302429, + "num_tokens": 308953720.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "grad_norm": 1.5815967321395874, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8843429684638977, + "num_tokens": 308985265.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "grad_norm": 1.5720881223678589, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8533895015716553, + "num_tokens": 309023324.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "grad_norm": 1.4685747623443604, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8724365234375, + "num_tokens": 309064472.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "grad_norm": 1.4867017269134521, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.863776683807373, + "num_tokens": 309104409.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "grad_norm": 1.4494918584823608, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8907835483551025, + "num_tokens": 309144692.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "grad_norm": 1.4464553594589233, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8853314518928528, + "num_tokens": 309185688.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "grad_norm": 1.4857367277145386, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 
0.8782544732093811, + "num_tokens": 309224982.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "grad_norm": 1.5735279321670532, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.870725154876709, + "num_tokens": 309263395.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "grad_norm": 1.515348196029663, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8789505958557129, + "num_tokens": 309302835.0, + "step": 8107 + }, + { + "epoch": 1.031420938811856, + "grad_norm": 1.4384870529174805, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8848192691802979, + "num_tokens": 309341784.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "grad_norm": 1.4265142679214478, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8781102299690247, + "num_tokens": 309383432.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "grad_norm": 1.4909411668777466, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8780168294906616, + "num_tokens": 309420950.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "grad_norm": 1.6536959409713745, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8560401797294617, + "num_tokens": 309456707.0, + "step": 8111 + }, + { + "epoch": 1.0319297799262181, + "grad_norm": 1.6590529680252075, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8818625807762146, + "num_tokens": 309490531.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "grad_norm": 1.6224794387817383, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8749245405197144, + "num_tokens": 309524923.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "grad_norm": 1.4164046049118042, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8936959505081177, + "num_tokens": 309562769.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "grad_norm": 
1.5830811262130737, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8761447668075562, + "num_tokens": 309600046.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "grad_norm": 1.5155634880065918, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8624280691146851, + "num_tokens": 309640638.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "grad_norm": 1.5967458486557007, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8891870975494385, + "num_tokens": 309673256.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "grad_norm": 1.4401888847351074, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8680546283721924, + "num_tokens": 309712703.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "grad_norm": 1.6176785230636597, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8628064393997192, + "num_tokens": 309749504.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "grad_norm": 1.6079010963439941, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8671536445617676, + "num_tokens": 309783834.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "grad_norm": 1.383609652519226, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8735756874084473, + "num_tokens": 309826974.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "grad_norm": 1.4790898561477661, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8817092180252075, + "num_tokens": 309866950.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "grad_norm": 1.5083283185958862, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8784685134887695, + "num_tokens": 309905343.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "grad_norm": 1.4674100875854492, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8615444898605347, + "num_tokens": 
309947171.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "grad_norm": 1.6548449993133545, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8676302433013916, + "num_tokens": 309979721.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "grad_norm": 1.4276232719421387, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.880428671836853, + "num_tokens": 310017060.0, + "step": 8126 + }, + { + "epoch": 1.0338379341050756, + "grad_norm": 1.5892153978347778, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8799493312835693, + "num_tokens": 310049599.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "grad_norm": 1.606570839881897, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8693550825119019, + "num_tokens": 310084841.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "grad_norm": 1.412409782409668, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.867946982383728, + "num_tokens": 310130433.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "grad_norm": 1.5780713558197021, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8633145093917847, + "num_tokens": 310170716.0, + "step": 8130 + }, + { + "epoch": 1.0343467752194377, + "grad_norm": 1.5548683404922485, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8785467147827148, + "num_tokens": 310206493.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "grad_norm": 1.4437365531921387, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.864844560623169, + "num_tokens": 310250806.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "grad_norm": 1.390187382698059, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8841500878334045, + "num_tokens": 310292346.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "grad_norm": 1.4364274740219116, + "learning_rate": 1e-06, + 
"loss": 0.3609, + "mean_token_accuracy": 0.8751766681671143, + "num_tokens": 310333503.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "grad_norm": 1.5695979595184326, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8606610298156738, + "num_tokens": 310372818.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "grad_norm": 1.6110608577728271, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8681536912918091, + "num_tokens": 310410031.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "grad_norm": 1.5711469650268555, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8727750778198242, + "num_tokens": 310453014.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "grad_norm": 1.6463406085968018, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.870394766330719, + "num_tokens": 310487631.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "grad_norm": 1.434219479560852, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8742706775665283, + "num_tokens": 310533285.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "grad_norm": 1.5155049562454224, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8660557270050049, + "num_tokens": 310571787.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "grad_norm": 1.6079778671264648, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8666673898696899, + "num_tokens": 310610937.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "grad_norm": 1.7272580862045288, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.856166660785675, + "num_tokens": 310646277.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "grad_norm": 1.5249674320220947, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8881868124008179, + "num_tokens": 310680358.0, + "step": 8143 + }, + { + "epoch": 
1.0360005088411144, + "grad_norm": 1.5861032009124756, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8627058267593384, + "num_tokens": 310718469.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "grad_norm": 1.4481372833251953, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8787981271743774, + "num_tokens": 310759619.0, + "step": 8145 + }, + { + "epoch": 1.0362549293982954, + "grad_norm": 1.4606564044952393, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8920583128929138, + "num_tokens": 310800514.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "grad_norm": 1.469132900238037, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8752294778823853, + "num_tokens": 310842005.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "grad_norm": 1.8025468587875366, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8726916313171387, + "num_tokens": 310873532.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "grad_norm": 1.4869320392608643, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8625451326370239, + "num_tokens": 310913839.0, + "step": 8149 + }, + { + "epoch": 1.0367637705126573, + "grad_norm": 1.5715322494506836, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8663167357444763, + "num_tokens": 310952975.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "grad_norm": 1.4743722677230835, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8806167244911194, + "num_tokens": 310992806.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "grad_norm": 1.4866682291030884, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8749212026596069, + "num_tokens": 311028208.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "grad_norm": 1.6067099571228027, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 
0.883437991142273, + "num_tokens": 311064498.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "grad_norm": 1.5371336936950684, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8646819591522217, + "num_tokens": 311102968.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "grad_norm": 1.6349270343780518, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.868873119354248, + "num_tokens": 311136382.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "grad_norm": 1.5188802480697632, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.874163031578064, + "num_tokens": 311175824.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "grad_norm": 1.4689587354660034, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8801168203353882, + "num_tokens": 311212377.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "grad_norm": 1.578628659248352, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8712446689605713, + "num_tokens": 311247772.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "grad_norm": 1.4659559726715088, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8742054104804993, + "num_tokens": 311288462.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "grad_norm": 1.3986694812774658, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8907727003097534, + "num_tokens": 311330506.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "grad_norm": 1.4422587156295776, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8842323422431946, + "num_tokens": 311367710.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "grad_norm": 1.419471263885498, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8845884799957275, + "num_tokens": 311409065.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "grad_norm": 1.532222867012024, + 
"learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8729425072669983, + "num_tokens": 311444227.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "grad_norm": 1.489936113357544, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8829512000083923, + "num_tokens": 311483393.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "grad_norm": 1.4413269758224487, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8836003541946411, + "num_tokens": 311524112.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "grad_norm": 1.5494353771209717, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8672806024551392, + "num_tokens": 311560865.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "grad_norm": 1.5021144151687622, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8672381639480591, + "num_tokens": 311600637.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "grad_norm": 1.6189754009246826, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8821910619735718, + "num_tokens": 311633216.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "grad_norm": 1.5415562391281128, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8621609807014465, + "num_tokens": 311671361.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "grad_norm": 1.4770359992980957, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8728427886962891, + "num_tokens": 311711119.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "grad_norm": 1.5776233673095703, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8684232234954834, + "num_tokens": 311749710.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "grad_norm": 1.412266731262207, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8824722170829773, + "num_tokens": 311791614.0, + "step": 8172 + 
}, + { + "epoch": 1.0396896069202393, + "grad_norm": 1.411737084388733, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8775122165679932, + "num_tokens": 311832898.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "grad_norm": 1.4556350708007812, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8730957508087158, + "num_tokens": 311874984.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "grad_norm": 1.6261699199676514, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8747233152389526, + "num_tokens": 311908653.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "grad_norm": 1.4305495023727417, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8863813877105713, + "num_tokens": 311947807.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "grad_norm": 1.5692230463027954, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8695653676986694, + "num_tokens": 311985041.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "grad_norm": 1.592340350151062, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.867131233215332, + "num_tokens": 312021551.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "grad_norm": 1.4579808712005615, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8574309945106506, + "num_tokens": 312064678.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "grad_norm": 1.628208875656128, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8712738156318665, + "num_tokens": 312099581.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "grad_norm": 1.657480239868164, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8771597146987915, + "num_tokens": 312131533.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "grad_norm": 1.5163674354553223, + "learning_rate": 1e-06, + "loss": 0.3996, + 
"mean_token_accuracy": 0.8595800995826721, + "num_tokens": 312174383.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "grad_norm": 1.676194667816162, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8610948324203491, + "num_tokens": 312213515.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "grad_norm": 1.4838817119598389, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8790556192398071, + "num_tokens": 312253168.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "grad_norm": 1.5383899211883545, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.854801595211029, + "num_tokens": 312295409.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "grad_norm": 1.3692047595977783, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8890672922134399, + "num_tokens": 312336914.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "grad_norm": 1.516139030456543, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8766035437583923, + "num_tokens": 312379660.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "grad_norm": 1.5057978630065918, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8918794393539429, + "num_tokens": 312414960.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "grad_norm": 1.5544201135635376, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8610171675682068, + "num_tokens": 312451503.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "grad_norm": 1.4931566715240479, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8884926438331604, + "num_tokens": 312486105.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "grad_norm": 1.5040841102600098, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8802887797355652, + "num_tokens": 312522520.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + 
"grad_norm": 1.544712781906128, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.885387659072876, + "num_tokens": 312560063.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "grad_norm": 1.4088501930236816, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8883197903633118, + "num_tokens": 312603613.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "grad_norm": 6.139959335327148, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8682465553283691, + "num_tokens": 312646028.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "grad_norm": 1.5166547298431396, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8840140104293823, + "num_tokens": 312681747.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "grad_norm": 1.565717339515686, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8823333978652954, + "num_tokens": 312718874.0, + "step": 8196 + }, + { + "epoch": 1.0427426536064115, + "grad_norm": 1.4517065286636353, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8637725114822388, + "num_tokens": 312761565.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "grad_norm": 1.5598503351211548, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8746283054351807, + "num_tokens": 312798484.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "grad_norm": 1.6261318922042847, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.864008903503418, + "num_tokens": 312835403.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "grad_norm": 1.623870611190796, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8734121322631836, + "num_tokens": 312870970.0, + "step": 8200 + }, + { + "epoch": 1.0432514947207734, + "grad_norm": 1.4502758979797363, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.881123423576355, + "num_tokens": 
312911520.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "grad_norm": 1.5333694219589233, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8863959312438965, + "num_tokens": 312947221.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "grad_norm": 1.6165107488632202, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8811395168304443, + "num_tokens": 312986634.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "grad_norm": 1.497190237045288, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8804508447647095, + "num_tokens": 313023248.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "grad_norm": 1.4905601739883423, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.872883141040802, + "num_tokens": 313064245.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "grad_norm": 1.5934990644454956, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8771960735321045, + "num_tokens": 313098352.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "grad_norm": 1.526017427444458, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8864318132400513, + "num_tokens": 313134346.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "grad_norm": 1.5259069204330444, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8803284764289856, + "num_tokens": 313171095.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "grad_norm": 1.609233021736145, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8727912306785583, + "num_tokens": 313208001.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "grad_norm": 1.6081328392028809, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.881281316280365, + "num_tokens": 313240029.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "grad_norm": 1.5516630411148071, + "learning_rate": 1e-06, + 
"loss": 0.3617, + "mean_token_accuracy": 0.8716301918029785, + "num_tokens": 313274684.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "grad_norm": 1.5017905235290527, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8781776428222656, + "num_tokens": 313318399.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "grad_norm": 1.4581557512283325, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8839011192321777, + "num_tokens": 313356108.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "grad_norm": 1.3577232360839844, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8831886053085327, + "num_tokens": 313397231.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "grad_norm": 1.545182228088379, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8818708658218384, + "num_tokens": 313430951.0, + "step": 8215 + }, + { + "epoch": 1.045159648899631, + "grad_norm": 1.4610682725906372, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8793854117393494, + "num_tokens": 313469455.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "grad_norm": 1.5710909366607666, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8668345808982849, + "num_tokens": 313505845.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "grad_norm": 1.415741205215454, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8654256463050842, + "num_tokens": 313547401.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "grad_norm": 1.470680832862854, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8802753686904907, + "num_tokens": 313586596.0, + "step": 8219 + }, + { + "epoch": 1.0456684900139932, + "grad_norm": 1.3963817358016968, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8797078728675842, + "num_tokens": 313629583.0, + "step": 8220 + }, + { + "epoch": 
1.0457957002925837, + "grad_norm": 1.4833937883377075, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8765616416931152, + "num_tokens": 313665549.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "grad_norm": 1.452264666557312, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8737964630126953, + "num_tokens": 313705991.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "grad_norm": 1.6141927242279053, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8765131831169128, + "num_tokens": 313739308.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "grad_norm": 1.5383118391036987, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8791388273239136, + "num_tokens": 313773638.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "grad_norm": 1.566812515258789, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8602628707885742, + "num_tokens": 313810781.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "grad_norm": 1.4136912822723389, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8816986083984375, + "num_tokens": 313851925.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "grad_norm": 1.3803561925888062, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8806502819061279, + "num_tokens": 313896304.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "grad_norm": 1.3952733278274536, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8832803964614868, + "num_tokens": 313939484.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "grad_norm": 1.4458611011505127, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8812737464904785, + "num_tokens": 313977739.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "grad_norm": 1.5151475667953491, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 
0.8788751363754272, + "num_tokens": 314020895.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "grad_norm": 1.493039846420288, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8659156560897827, + "num_tokens": 314064972.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "grad_norm": 1.521356463432312, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8750318884849548, + "num_tokens": 314104935.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "grad_norm": 1.4686827659606934, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8863061666488647, + "num_tokens": 314140576.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "grad_norm": 1.4805095195770264, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8667376041412354, + "num_tokens": 314181274.0, + "step": 8234 + }, + { + "epoch": 1.0475766441928507, + "grad_norm": 1.508576512336731, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.880251944065094, + "num_tokens": 314220688.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "grad_norm": 1.3569780588150024, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9024573564529419, + "num_tokens": 314259355.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "grad_norm": 1.5507982969284058, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8809552788734436, + "num_tokens": 314295190.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "grad_norm": 1.5078864097595215, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8708269596099854, + "num_tokens": 314332926.0, + "step": 8238 + }, + { + "epoch": 1.0480854853072128, + "grad_norm": 1.4447462558746338, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8495256900787354, + "num_tokens": 314377731.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "grad_norm": 
1.6012802124023438, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8613672256469727, + "num_tokens": 314412745.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "grad_norm": 1.6001091003417969, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8797351717948914, + "num_tokens": 314448101.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "grad_norm": 1.528968334197998, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8758479952812195, + "num_tokens": 314486152.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "grad_norm": 1.5184108018875122, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8727002143859863, + "num_tokens": 314523690.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "grad_norm": 1.4848867654800415, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8847295641899109, + "num_tokens": 314562454.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "grad_norm": 1.5286246538162231, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.880268931388855, + "num_tokens": 314598358.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "grad_norm": 1.5940141677856445, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8571761846542358, + "num_tokens": 314634924.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "grad_norm": 1.5185439586639404, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8724961876869202, + "num_tokens": 314675497.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "grad_norm": 1.5916521549224854, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8756831884384155, + "num_tokens": 314712357.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "grad_norm": 1.6217823028564453, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8775162696838379, + "num_tokens": 
314747234.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "grad_norm": 1.463310718536377, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8643316030502319, + "num_tokens": 314786980.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "grad_norm": 1.5882484912872314, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8667921423912048, + "num_tokens": 314820915.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "grad_norm": 1.4356471300125122, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8791432976722717, + "num_tokens": 314860256.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "grad_norm": 1.5590859651565552, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8657162189483643, + "num_tokens": 314895518.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + "grad_norm": 1.5367156267166138, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.87530517578125, + "num_tokens": 314930995.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "grad_norm": 1.5877236127853394, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8657137155532837, + "num_tokens": 314967380.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "grad_norm": 1.5444269180297852, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8754056096076965, + "num_tokens": 315004935.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "grad_norm": 1.5177000761032104, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8833470344543457, + "num_tokens": 315039670.0, + "step": 8257 + }, + { + "epoch": 1.0505024806004326, + "grad_norm": 1.6103934049606323, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8825646638870239, + "num_tokens": 315073648.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "grad_norm": 1.4682778120040894, + "learning_rate": 1e-06, + 
"loss": 0.3775, + "mean_token_accuracy": 0.8728543519973755, + "num_tokens": 315114293.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "grad_norm": 1.6472630500793457, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8694060444831848, + "num_tokens": 315151085.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "grad_norm": 1.6125354766845703, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8640073537826538, + "num_tokens": 315189662.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "grad_norm": 1.4426906108856201, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8872168660163879, + "num_tokens": 315228333.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "grad_norm": 1.643453598022461, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8766515254974365, + "num_tokens": 315262974.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "grad_norm": 1.598386526107788, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8780626654624939, + "num_tokens": 315296998.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "grad_norm": 1.5108060836791992, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.880858838558197, + "num_tokens": 315333729.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "grad_norm": 1.590983510017395, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8708584308624268, + "num_tokens": 315369328.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "grad_norm": 1.557442545890808, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8766756057739258, + "num_tokens": 315403946.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "grad_norm": 1.5085875988006592, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.88128662109375, + "num_tokens": 315441918.0, + "step": 8268 + }, + { + "epoch": 
1.0519017936649282, + "grad_norm": 1.6462064981460571, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8663330078125, + "num_tokens": 315476560.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "grad_norm": 1.4896876811981201, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8750036358833313, + "num_tokens": 315519118.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "grad_norm": 1.6632388830184937, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8657143712043762, + "num_tokens": 315553549.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "grad_norm": 1.4879564046859741, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8855715394020081, + "num_tokens": 315591546.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "grad_norm": 1.5244061946868896, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8688819408416748, + "num_tokens": 315631803.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "grad_norm": 1.4592955112457275, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8802151679992676, + "num_tokens": 315671000.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "grad_norm": 1.4917150735855103, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8792567253112793, + "num_tokens": 315706998.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "grad_norm": 1.5631762742996216, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8716015815734863, + "num_tokens": 315747293.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "grad_norm": 1.3024383783340454, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.879576563835144, + "num_tokens": 315793849.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "grad_norm": 1.6826558113098145, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 
0.8658616542816162, + "num_tokens": 315828515.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "grad_norm": 1.5705715417861938, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8678888082504272, + "num_tokens": 315864252.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "grad_norm": 1.49573814868927, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8799075484275818, + "num_tokens": 315903179.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "grad_norm": 1.472967505455017, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8611712455749512, + "num_tokens": 315946267.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "grad_norm": 1.555201530456543, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8724346160888672, + "num_tokens": 315982055.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "grad_norm": 1.52721107006073, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8842002153396606, + "num_tokens": 316018038.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "grad_norm": 1.3595921993255615, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.882860541343689, + "num_tokens": 316061370.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "grad_norm": 1.473922848701477, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8685773015022278, + "num_tokens": 316102764.0, + "step": 8285 + }, + { + "epoch": 1.0540643684009667, + "grad_norm": 1.463731288909912, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.87994384765625, + "num_tokens": 316141352.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "grad_norm": 1.4264215230941772, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.884201169013977, + "num_tokens": 316180106.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "grad_norm": 1.336059331893921, + 
"learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8851834535598755, + "num_tokens": 316222017.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "grad_norm": 1.343269944190979, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8673269152641296, + "num_tokens": 316267888.0, + "step": 8289 + }, + { + "epoch": 1.0545732095153288, + "grad_norm": 1.5110604763031006, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8728725910186768, + "num_tokens": 316307668.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "grad_norm": 1.470649003982544, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8796510696411133, + "num_tokens": 316346485.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "grad_norm": 1.6665964126586914, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8858283758163452, + "num_tokens": 316378865.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "grad_norm": 1.562037467956543, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8709564805030823, + "num_tokens": 316417600.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "grad_norm": 1.5276497602462769, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8840302228927612, + "num_tokens": 316453754.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "grad_norm": 1.6469324827194214, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8802398443222046, + "num_tokens": 316485043.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "grad_norm": 1.672652006149292, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8888083100318909, + "num_tokens": 316520097.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "grad_norm": 1.5877103805541992, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.863632321357727, + "num_tokens": 316559314.0, + "step": 8297 + }, + 
{ + "epoch": 1.0555908917440529, + "grad_norm": 1.4983429908752441, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8664616346359253, + "num_tokens": 316599090.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "grad_norm": 1.5619103908538818, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.86667400598526, + "num_tokens": 316640818.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "grad_norm": 1.4341140985488892, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8776935935020447, + "num_tokens": 316681271.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "grad_norm": 1.5676127672195435, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8813261985778809, + "num_tokens": 316715326.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "grad_norm": 1.5079511404037476, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8732120990753174, + "num_tokens": 316751733.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "grad_norm": 1.562543272972107, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.874382734298706, + "num_tokens": 316788938.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "grad_norm": 1.5584183931350708, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8731791973114014, + "num_tokens": 316827342.0, + "step": 8304 + }, + { + "epoch": 1.0564813636941865, + "grad_norm": 1.5999761819839478, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8729265928268433, + "num_tokens": 316869699.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "grad_norm": 1.6538249254226685, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8759522438049316, + "num_tokens": 316900278.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "grad_norm": 1.641114354133606, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 
0.8492233753204346, + "num_tokens": 316936999.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "grad_norm": 1.4166202545166016, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8824995756149292, + "num_tokens": 316978179.0, + "step": 8308 + }, + { + "epoch": 1.0569902048085484, + "grad_norm": 1.5120346546173096, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8675495386123657, + "num_tokens": 317020384.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "grad_norm": 1.4469788074493408, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8754194378852844, + "num_tokens": 317061370.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "grad_norm": 1.7651994228363037, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8683844804763794, + "num_tokens": 317093875.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "grad_norm": 1.580862045288086, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8745912313461304, + "num_tokens": 317128995.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "grad_norm": 1.4485470056533813, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8769485950469971, + "num_tokens": 317167888.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "grad_norm": 1.4610670804977417, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8740517497062683, + "num_tokens": 317208874.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "grad_norm": 1.6951096057891846, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8784347772598267, + "num_tokens": 317244495.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "grad_norm": 1.5965830087661743, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8737709522247314, + "num_tokens": 317283187.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "grad_norm": 
1.5642907619476318, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8679295778274536, + "num_tokens": 317325922.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "grad_norm": 1.5588188171386719, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8770354390144348, + "num_tokens": 317359769.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "grad_norm": 1.5118300914764404, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8706109523773193, + "num_tokens": 317399328.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "grad_norm": 1.5123242139816284, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.881986141204834, + "num_tokens": 317437115.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "grad_norm": 1.390386939048767, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.860723078250885, + "num_tokens": 317486264.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "grad_norm": 1.506812572479248, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8843017816543579, + "num_tokens": 317521608.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "grad_norm": 1.4292985200881958, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.867709219455719, + "num_tokens": 317563831.0, + "step": 8323 + }, + { + "epoch": 1.0588983589874061, + "grad_norm": 1.5144312381744385, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8824328184127808, + "num_tokens": 317599599.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "grad_norm": 1.7094347476959229, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8604927062988281, + "num_tokens": 317634225.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "grad_norm": 1.610260248184204, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8686801195144653, + "num_tokens": 
317672002.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "grad_norm": 1.3109517097473145, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8742889761924744, + "num_tokens": 317716705.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "grad_norm": 1.509553074836731, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8726932406425476, + "num_tokens": 317753343.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "grad_norm": 1.5333975553512573, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8680541515350342, + "num_tokens": 317789932.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "grad_norm": 1.5385684967041016, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8683633804321289, + "num_tokens": 317829984.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "grad_norm": 1.505767822265625, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.883551836013794, + "num_tokens": 317869273.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "grad_norm": 1.4923524856567383, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.877265989780426, + "num_tokens": 317907236.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "grad_norm": 1.633834719657898, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8788812756538391, + "num_tokens": 317941719.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "grad_norm": 1.5976258516311646, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8711605072021484, + "num_tokens": 317977538.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "grad_norm": 1.4702602624893188, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8716504573822021, + "num_tokens": 318017362.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "grad_norm": 1.4279240369796753, + "learning_rate": 1e-06, + 
"loss": 0.3528, + "mean_token_accuracy": 0.8709659576416016, + "num_tokens": 318059943.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "grad_norm": 1.4354902505874634, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8759963512420654, + "num_tokens": 318104200.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "grad_norm": 1.4512726068496704, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8864465951919556, + "num_tokens": 318145457.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "grad_norm": 1.483864426612854, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8728073835372925, + "num_tokens": 318183895.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "grad_norm": 1.5581599473953247, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8800891637802124, + "num_tokens": 318220110.0, + "step": 8340 + }, + { + "epoch": 1.061060933723445, + "grad_norm": 1.4793567657470703, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8796591758728027, + "num_tokens": 318261995.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "grad_norm": 1.7264052629470825, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8677458763122559, + "num_tokens": 318296435.0, + "step": 8342 + }, + { + "epoch": 1.061315354280626, + "grad_norm": 1.419600248336792, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8902021050453186, + "num_tokens": 318333211.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "grad_norm": 1.4884823560714722, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8915280699729919, + "num_tokens": 318372357.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "grad_norm": 1.554675579071045, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.890372633934021, + "num_tokens": 318406396.0, + "step": 8345 + }, + { + "epoch": 
1.0616969851163973, + "grad_norm": 1.6231606006622314, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8618701696395874, + "num_tokens": 318445089.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "grad_norm": 1.5756131410598755, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.878854513168335, + "num_tokens": 318481166.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "grad_norm": 1.407064437866211, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8740983009338379, + "num_tokens": 318524184.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "grad_norm": 1.6064518690109253, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8660129308700562, + "num_tokens": 318561238.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "grad_norm": 1.674949049949646, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.856234073638916, + "num_tokens": 318595507.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "grad_norm": 1.512650966644287, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8807587027549744, + "num_tokens": 318630811.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "grad_norm": 1.5410643815994263, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8729149699211121, + "num_tokens": 318667935.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "grad_norm": 1.4866477251052856, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8850511312484741, + "num_tokens": 318704993.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "grad_norm": 1.4942690134048462, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8631508946418762, + "num_tokens": 318741882.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "grad_norm": 1.403356909751892, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 
0.8813884854316711, + "num_tokens": 318778294.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "grad_norm": 1.5788084268569946, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8643940091133118, + "num_tokens": 318812721.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "grad_norm": 1.5795553922653198, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8768627047538757, + "num_tokens": 318850344.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "grad_norm": 1.3643388748168945, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8844892978668213, + "num_tokens": 318891582.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "grad_norm": 1.429077386856079, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8795337677001953, + "num_tokens": 318932848.0, + "step": 8359 + }, + { + "epoch": 1.0634779290166645, + "grad_norm": 1.3616523742675781, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8754494786262512, + "num_tokens": 318975945.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "grad_norm": 1.3967554569244385, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8685995936393738, + "num_tokens": 319020530.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "grad_norm": 1.5188666582107544, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8754406571388245, + "num_tokens": 319056953.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "grad_norm": 1.507796287536621, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8880693912506104, + "num_tokens": 319092355.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "grad_norm": 3.6358230113983154, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8749493360519409, + "num_tokens": 319134236.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "grad_norm": 
1.517006516456604, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8860771059989929, + "num_tokens": 319174483.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "grad_norm": 1.5153542757034302, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8851993083953857, + "num_tokens": 319212425.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "grad_norm": 1.6118404865264893, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8714526295661926, + "num_tokens": 319247387.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "grad_norm": 1.5939515829086304, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8697158098220825, + "num_tokens": 319288268.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "grad_norm": 1.5495634078979492, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8652561902999878, + "num_tokens": 319324809.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "grad_norm": 1.4985854625701904, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8872414827346802, + "num_tokens": 319360060.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "grad_norm": 1.6035462617874146, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8729684948921204, + "num_tokens": 319394343.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "grad_norm": 1.5429279804229736, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8562706708908081, + "num_tokens": 319434642.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "grad_norm": 1.354219913482666, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8756470680236816, + "num_tokens": 319483461.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "grad_norm": 1.5082340240478516, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.877254843711853, + "num_tokens": 
319522408.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "grad_norm": 1.4542003870010376, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8838614821434021, + "num_tokens": 319560572.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "grad_norm": 1.5587342977523804, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8765312433242798, + "num_tokens": 319595995.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "grad_norm": 1.4788285493850708, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8838993310928345, + "num_tokens": 319634023.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "grad_norm": 1.4993053674697876, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8787034749984741, + "num_tokens": 319668376.0, + "step": 8378 + }, + { + "epoch": 1.0658949243098843, + "grad_norm": 1.4510058164596558, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8721575140953064, + "num_tokens": 319713346.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "grad_norm": 1.4118351936340332, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8881512880325317, + "num_tokens": 319753128.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "grad_norm": 1.595991849899292, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8782167434692383, + "num_tokens": 319789592.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "grad_norm": 1.5695140361785889, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8782863616943359, + "num_tokens": 319823200.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "grad_norm": 1.4273781776428223, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8749469518661499, + "num_tokens": 319867290.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "grad_norm": 1.5866920948028564, + "learning_rate": 1e-06, 
+ "loss": 0.356, + "mean_token_accuracy": 0.8758387565612793, + "num_tokens": 319906234.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "grad_norm": 1.7257081270217896, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8552654981613159, + "num_tokens": 319941585.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "grad_norm": 1.57749342918396, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.871344804763794, + "num_tokens": 319978241.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "grad_norm": 1.4899762868881226, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8765277862548828, + "num_tokens": 320018648.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "grad_norm": 1.410016417503357, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8834991455078125, + "num_tokens": 320059186.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "grad_norm": 1.4354513883590698, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8866484761238098, + "num_tokens": 320095452.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "grad_norm": 1.4135408401489258, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8774529099464417, + "num_tokens": 320138307.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "grad_norm": 1.4675312042236328, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8832836151123047, + "num_tokens": 320177773.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "grad_norm": 1.4684735536575317, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8753239512443542, + "num_tokens": 320213742.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "grad_norm": 1.5002336502075195, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8893047571182251, + "num_tokens": 320248895.0, + "step": 8393 + }, + { + "epoch": 
1.0678030784887418, + "grad_norm": 1.4881227016448975, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8780431747436523, + "num_tokens": 320288601.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "grad_norm": 1.5018230676651, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.875031590461731, + "num_tokens": 320326638.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "grad_norm": 1.5759559869766235, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8660644292831421, + "num_tokens": 320364802.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "grad_norm": 1.5664674043655396, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.876110851764679, + "num_tokens": 320401602.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "grad_norm": 1.53563392162323, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8756850361824036, + "num_tokens": 320438651.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "grad_norm": 1.571003794670105, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8757741451263428, + "num_tokens": 320477104.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "grad_norm": 1.4331468343734741, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8833810091018677, + "num_tokens": 320517421.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "grad_norm": 1.3991317749023438, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8799320459365845, + "num_tokens": 320559708.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "grad_norm": 1.4894615411758423, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8836843967437744, + "num_tokens": 320594597.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "grad_norm": 1.579099178314209, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 
0.8879561424255371, + "num_tokens": 320632649.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "grad_norm": 1.752940058708191, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8698980212211609, + "num_tokens": 320665699.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "grad_norm": 1.407675862312317, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8713279962539673, + "num_tokens": 320706523.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "grad_norm": 1.6339939832687378, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8633030652999878, + "num_tokens": 320741665.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "grad_norm": 1.6470105648040771, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8804733157157898, + "num_tokens": 320774606.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "grad_norm": 1.651604413986206, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8515880107879639, + "num_tokens": 320811879.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "grad_norm": 1.6976830959320068, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8662799000740051, + "num_tokens": 320848472.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "grad_norm": 1.6972155570983887, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8575089573860168, + "num_tokens": 320883938.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "grad_norm": 1.5216277837753296, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8834954500198364, + "num_tokens": 320919438.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "grad_norm": 1.4766864776611328, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8661603331565857, + "num_tokens": 320961336.0, + "step": 8412 + }, + { + "epoch": 1.0702200737819616, + "grad_norm": 
1.6283918619155884, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8728846311569214, + "num_tokens": 320992756.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "grad_norm": 1.395709753036499, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8734310865402222, + "num_tokens": 321036399.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "grad_norm": 1.619728684425354, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8698381185531616, + "num_tokens": 321068568.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "grad_norm": 1.522721767425537, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8731328248977661, + "num_tokens": 321108295.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "grad_norm": 1.657006025314331, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8763499855995178, + "num_tokens": 321143766.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "grad_norm": 1.364560842514038, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8841425180435181, + "num_tokens": 321186188.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "grad_norm": 1.7508536577224731, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8814801573753357, + "num_tokens": 321218079.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "grad_norm": 1.5096349716186523, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.879457950592041, + "num_tokens": 321254211.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "grad_norm": 1.5775480270385742, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8784012794494629, + "num_tokens": 321291145.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "grad_norm": 1.5235720872879028, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8762423396110535, + "num_tokens": 
321326588.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "grad_norm": 1.6431173086166382, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8808696866035461, + "num_tokens": 321363641.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "grad_norm": 1.710904598236084, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8527562022209167, + "num_tokens": 321394508.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "grad_norm": 1.5764098167419434, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8614306449890137, + "num_tokens": 321434881.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "grad_norm": 1.4628838300704956, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8868390321731567, + "num_tokens": 321471676.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "grad_norm": 1.3915574550628662, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8911631107330322, + "num_tokens": 321512647.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "grad_norm": 1.3704721927642822, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8944616913795471, + "num_tokens": 321556048.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "grad_norm": 1.5674254894256592, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8776465654373169, + "num_tokens": 321592420.0, + "step": 8429 + }, + { + "epoch": 1.0723826485180004, + "grad_norm": 1.607148289680481, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8823171257972717, + "num_tokens": 321627625.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "grad_norm": 1.5914738178253174, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8703928589820862, + "num_tokens": 321667794.0, + "step": 8431 + }, + { + "epoch": 1.0726370690751812, + "grad_norm": 1.4827880859375, + "learning_rate": 1e-06, + 
"loss": 0.3451, + "mean_token_accuracy": 0.8791051506996155, + "num_tokens": 321705773.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "grad_norm": 1.5148475170135498, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8879654407501221, + "num_tokens": 321743340.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "grad_norm": 1.4284532070159912, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8808572292327881, + "num_tokens": 321785055.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "grad_norm": 1.4130476713180542, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8893463611602783, + "num_tokens": 321822470.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "grad_norm": 1.5425087213516235, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8809683322906494, + "num_tokens": 321861723.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "grad_norm": 1.4727882146835327, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8855212330818176, + "num_tokens": 321902409.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "grad_norm": 1.5205198526382446, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8741086721420288, + "num_tokens": 321941813.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "grad_norm": 1.6078394651412964, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8592861890792847, + "num_tokens": 321980747.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "grad_norm": 1.5350215435028076, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8671635389328003, + "num_tokens": 322021149.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "grad_norm": 1.5395673513412476, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.867526650428772, + "num_tokens": 322062709.0, + "step": 8441 + }, + { + "epoch": 
1.0739091718610865, + "grad_norm": 1.4575058221817017, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8873724341392517, + "num_tokens": 322104823.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "grad_norm": 1.534029483795166, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8802856206893921, + "num_tokens": 322140299.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "grad_norm": 1.5135126113891602, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8861677050590515, + "num_tokens": 322178922.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "grad_norm": 1.533402681350708, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8668460249900818, + "num_tokens": 322218552.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "grad_norm": 1.7483423948287964, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.868486762046814, + "num_tokens": 322250593.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "grad_norm": 1.5378774404525757, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8875682353973389, + "num_tokens": 322289652.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "grad_norm": 1.4317482709884644, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8852392435073853, + "num_tokens": 322330377.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, + "grad_norm": 1.4897671937942505, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8845263719558716, + "num_tokens": 322364164.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "grad_norm": 1.4677051305770874, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8698791861534119, + "num_tokens": 322405633.0, + "step": 8450 + }, + { + "epoch": 1.075054064368401, + "grad_norm": 1.5846298933029175, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 
0.8763220310211182, + "num_tokens": 322439606.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "grad_norm": 1.7253867387771606, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8761391043663025, + "num_tokens": 322470680.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "grad_norm": 1.5883549451828003, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8781102895736694, + "num_tokens": 322503645.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "grad_norm": 1.5780450105667114, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8746622204780579, + "num_tokens": 322541531.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "grad_norm": 1.477859377861023, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8661594986915588, + "num_tokens": 322584067.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "grad_norm": 1.4108656644821167, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8932048082351685, + "num_tokens": 322621291.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "grad_norm": 1.6510251760482788, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.87120121717453, + "num_tokens": 322658460.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "grad_norm": 1.512726068496704, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8695661425590515, + "num_tokens": 322697402.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "grad_norm": 1.5502489805221558, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8771581649780273, + "num_tokens": 322733625.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "grad_norm": 1.3828283548355103, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8822224140167236, + "num_tokens": 322779446.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "grad_norm": 
1.3972455263137817, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8829755187034607, + "num_tokens": 322819460.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "grad_norm": 1.5240821838378906, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8771781325340271, + "num_tokens": 322859195.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "grad_norm": 1.612903118133545, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8801090121269226, + "num_tokens": 322900624.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "grad_norm": 1.5139840841293335, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8712049126625061, + "num_tokens": 322940690.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "grad_norm": 1.6722298860549927, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.875708281993866, + "num_tokens": 322972659.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "grad_norm": 1.5399430990219116, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8861374855041504, + "num_tokens": 323006943.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "grad_norm": 1.4212112426757812, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8869737386703491, + "num_tokens": 323047415.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "grad_norm": 1.7147250175476074, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8626772165298462, + "num_tokens": 323084940.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "grad_norm": 1.4704030752182007, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8725796341896057, + "num_tokens": 323122946.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, + "grad_norm": 1.5000473260879517, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8718420267105103, + "num_tokens": 
323161719.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "grad_norm": 1.446217656135559, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8748949766159058, + "num_tokens": 323202188.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "grad_norm": 1.52599036693573, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8779774904251099, + "num_tokens": 323237414.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "grad_norm": 1.447282314300537, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.878302276134491, + "num_tokens": 323275120.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "grad_norm": 1.5754337310791016, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.867225706577301, + "num_tokens": 323313484.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "grad_norm": 1.544678807258606, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.880183219909668, + "num_tokens": 323348349.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "grad_norm": 1.4771476984024048, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8782331347465515, + "num_tokens": 323387409.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "grad_norm": 1.4901847839355469, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8621774315834045, + "num_tokens": 323428992.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "grad_norm": 1.4947104454040527, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.884285569190979, + "num_tokens": 323465467.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "grad_norm": 1.6148526668548584, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8587573170661926, + "num_tokens": 323501526.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "grad_norm": 1.5069202184677124, + "learning_rate": 1e-06, + "loss": 
0.3425, + "mean_token_accuracy": 0.8788561820983887, + "num_tokens": 323540498.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "grad_norm": 1.7682615518569946, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8511642217636108, + "num_tokens": 323577076.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "grad_norm": 1.5578136444091797, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8710638284683228, + "num_tokens": 323613822.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "grad_norm": 1.5229625701904297, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8719240427017212, + "num_tokens": 323650873.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "grad_norm": 1.3531452417373657, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8945814371109009, + "num_tokens": 323689970.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "grad_norm": 1.5915769338607788, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8715677261352539, + "num_tokens": 323726168.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "grad_norm": 1.4953341484069824, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8882084488868713, + "num_tokens": 323768106.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "grad_norm": 1.4368516206741333, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8588927984237671, + "num_tokens": 323811064.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "grad_norm": 1.4507817029953003, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8799331188201904, + "num_tokens": 323849986.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "grad_norm": 1.441382884979248, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8685824871063232, + "num_tokens": 323895033.0, + "step": 8489 + }, + { + "epoch": 
1.080015265233431, + "grad_norm": 1.4534859657287598, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8778460025787354, + "num_tokens": 323935658.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "grad_norm": 1.53384530544281, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8728835582733154, + "num_tokens": 323972113.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "grad_norm": 1.4607189893722534, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8775966167449951, + "num_tokens": 324011553.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "grad_norm": 1.591447353363037, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.862250566482544, + "num_tokens": 324046841.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "grad_norm": 1.55753755569458, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8754271864891052, + "num_tokens": 324085854.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "grad_norm": 1.5756746530532837, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8657689690589905, + "num_tokens": 324122589.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "grad_norm": 1.6247979402542114, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8741990327835083, + "num_tokens": 324158876.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "grad_norm": 1.4580937623977661, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8855534195899963, + "num_tokens": 324198989.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "grad_norm": 1.4818605184555054, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8746204376220703, + "num_tokens": 324241193.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "grad_norm": 1.5184698104858398, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 
0.8778054714202881, + "num_tokens": 324278945.0, + "step": 8499 + }, + { + "epoch": 1.081287368019336, + "grad_norm": 1.5166465044021606, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8664565086364746, + "num_tokens": 324318823.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "grad_norm": 1.508941650390625, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8714070320129395, + "num_tokens": 324355563.0, + "step": 8501 + }, + { + "epoch": 1.081541788576517, + "grad_norm": 1.6479473114013672, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8765726089477539, + "num_tokens": 324387888.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "grad_norm": 1.5476388931274414, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8734961748123169, + "num_tokens": 324425240.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "grad_norm": 1.646807074546814, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8617366552352905, + "num_tokens": 324460371.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "grad_norm": 1.4819309711456299, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8600276112556458, + "num_tokens": 324500277.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "grad_norm": 1.4085988998413086, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8858731985092163, + "num_tokens": 324541013.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "grad_norm": 1.7349557876586914, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8590347170829773, + "num_tokens": 324574967.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "grad_norm": 1.5071035623550415, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8842167854309082, + "num_tokens": 324612413.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "grad_norm": 
1.6694774627685547, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8717190027236938, + "num_tokens": 324648369.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "grad_norm": 1.4828317165374756, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8782691955566406, + "num_tokens": 324689056.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "grad_norm": 1.644860863685608, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8804718255996704, + "num_tokens": 324722207.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "grad_norm": 1.4749647378921509, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8566805124282837, + "num_tokens": 324769491.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "grad_norm": 1.3655812740325928, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8853539824485779, + "num_tokens": 324813063.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "grad_norm": 1.6922006607055664, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8826180696487427, + "num_tokens": 324843860.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "grad_norm": 1.3868948221206665, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8684020638465881, + "num_tokens": 324886900.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "grad_norm": 1.509324550628662, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8733910322189331, + "num_tokens": 324923970.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "grad_norm": 1.5238587856292725, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.864789605140686, + "num_tokens": 324961880.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "grad_norm": 1.4954663515090942, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8736761212348938, + "num_tokens": 
325001315.0, + "step": 8518 + }, + { + "epoch": 1.0837043633125556, + "grad_norm": 1.6292095184326172, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8746704459190369, + "num_tokens": 325034874.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "grad_norm": 1.5220825672149658, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8693078756332397, + "num_tokens": 325072767.0, + "step": 8520 + }, + { + "epoch": 1.0839587838697367, + "grad_norm": 1.4974730014801025, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8882802724838257, + "num_tokens": 325108986.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "grad_norm": 1.456169605255127, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8905366659164429, + "num_tokens": 325147102.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "grad_norm": 1.5715991258621216, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8654675483703613, + "num_tokens": 325184817.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "grad_norm": 1.4215866327285767, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8883048295974731, + "num_tokens": 325223276.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "grad_norm": 1.599644660949707, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8820405006408691, + "num_tokens": 325261152.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "grad_norm": 1.5197231769561768, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8835699558258057, + "num_tokens": 325298007.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "grad_norm": 1.6433807611465454, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8766589164733887, + "num_tokens": 325331915.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "grad_norm": 1.6162594556808472, + "learning_rate": 1e-06, 
+ "loss": 0.3657, + "mean_token_accuracy": 0.8744564056396484, + "num_tokens": 325365786.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "grad_norm": 1.6703743934631348, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8870561122894287, + "num_tokens": 325400704.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "grad_norm": 1.489151120185852, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8860222101211548, + "num_tokens": 325440293.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "grad_norm": 1.457360863685608, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8809161186218262, + "num_tokens": 325481031.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "grad_norm": 1.6386419534683228, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.870904803276062, + "num_tokens": 325514761.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "grad_norm": 1.5674841403961182, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8607932329177856, + "num_tokens": 325555592.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "grad_norm": 1.5775656700134277, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.875027596950531, + "num_tokens": 325590721.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "grad_norm": 1.4421643018722534, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8874425292015076, + "num_tokens": 325633622.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "grad_norm": 1.6404767036437988, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8773845434188843, + "num_tokens": 325668660.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "grad_norm": 1.5131455659866333, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.885515034198761, + "num_tokens": 325705835.0, + "step": 8537 + }, + { + "epoch": 
1.0861213586057754, + "grad_norm": 1.4077790975570679, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8747403621673584, + "num_tokens": 325744310.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "grad_norm": 1.5760743618011475, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8595591187477112, + "num_tokens": 325783249.0, + "step": 8539 + }, + { + "epoch": 1.0863757791629565, + "grad_norm": 1.4712684154510498, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8742173910140991, + "num_tokens": 325823786.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "grad_norm": 1.511040449142456, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.882335901260376, + "num_tokens": 325860042.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "grad_norm": 1.4461749792099, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.875740647315979, + "num_tokens": 325900822.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "grad_norm": 1.4550012350082397, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8796660304069519, + "num_tokens": 325941601.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "grad_norm": 1.6068164110183716, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8755195140838623, + "num_tokens": 325977189.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "grad_norm": 1.4524332284927368, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.890238881111145, + "num_tokens": 326016083.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "grad_norm": 1.5849689245224, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8518772125244141, + "num_tokens": 326054768.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "grad_norm": 1.4685134887695312, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 
0.8574860095977783, + "num_tokens": 326094827.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "grad_norm": 1.608293890953064, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8732448816299438, + "num_tokens": 326128472.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "grad_norm": 1.5867637395858765, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8627440333366394, + "num_tokens": 326168156.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "grad_norm": 1.6730867624282837, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8580647706985474, + "num_tokens": 326207707.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "grad_norm": 1.6722486019134521, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8585355281829834, + "num_tokens": 326244956.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "grad_norm": 1.7788076400756836, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8611617088317871, + "num_tokens": 326275851.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "grad_norm": 1.5204156637191772, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8753359317779541, + "num_tokens": 326314745.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "grad_norm": 1.5098565816879272, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8843817710876465, + "num_tokens": 326357361.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "grad_norm": 1.653016209602356, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8814632892608643, + "num_tokens": 326395060.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "grad_norm": 1.5772219896316528, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8840807676315308, + "num_tokens": 326429678.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + "grad_norm": 
1.2995095252990723, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8686900734901428, + "num_tokens": 326481634.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "grad_norm": 1.4137135744094849, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8739439249038696, + "num_tokens": 326525837.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "grad_norm": 1.5755126476287842, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8490686416625977, + "num_tokens": 326565274.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "grad_norm": 1.5594696998596191, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8674080967903137, + "num_tokens": 326603075.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "grad_norm": 1.4619847536087036, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8899219036102295, + "num_tokens": 326640864.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "grad_norm": 1.4765336513519287, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8678435683250427, + "num_tokens": 326681824.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "grad_norm": 1.567276120185852, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8773091435432434, + "num_tokens": 326720929.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "grad_norm": 1.4806777238845825, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8810309767723083, + "num_tokens": 326758138.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "grad_norm": 1.4871619939804077, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8522665500640869, + "num_tokens": 326803223.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "grad_norm": 1.4561196565628052, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8651352524757385, + "num_tokens": 
326847580.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "grad_norm": 1.6508907079696655, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8813239336013794, + "num_tokens": 326881179.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "grad_norm": 1.5433850288391113, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8613362312316895, + "num_tokens": 326919763.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "grad_norm": 1.5707288980484009, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8614348769187927, + "num_tokens": 326962631.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "grad_norm": 1.7395116090774536, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8603423833847046, + "num_tokens": 326998156.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "grad_norm": 1.5736929178237915, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8721590042114258, + "num_tokens": 327036231.0, + "step": 8571 + }, + { + "epoch": 1.0904465080778527, + "grad_norm": 1.4520310163497925, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8732625246047974, + "num_tokens": 327076594.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "grad_norm": 1.3808200359344482, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8814961910247803, + "num_tokens": 327121829.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "grad_norm": 1.475562334060669, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8852244019508362, + "num_tokens": 327157649.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "grad_norm": 1.5723150968551636, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8760858178138733, + "num_tokens": 327192209.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "grad_norm": 1.5565248727798462, + "learning_rate": 1e-06, + 
"loss": 0.3457, + "mean_token_accuracy": 0.8776735067367554, + "num_tokens": 327230366.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "grad_norm": 1.5670573711395264, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8829407691955566, + "num_tokens": 327266040.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "grad_norm": 1.4230659008026123, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8832786679267883, + "num_tokens": 327303216.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "grad_norm": 1.396058201789856, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8898105621337891, + "num_tokens": 327339152.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "grad_norm": 1.622939109802246, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8715711832046509, + "num_tokens": 327371868.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "grad_norm": 1.5693490505218506, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8658758997917175, + "num_tokens": 327415518.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "grad_norm": 1.411939263343811, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8796261548995972, + "num_tokens": 327458015.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "grad_norm": 1.514897108078003, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8831570148468018, + "num_tokens": 327494782.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "grad_norm": 1.4960482120513916, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.868781328201294, + "num_tokens": 327537124.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "grad_norm": 1.5409150123596191, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8849813342094421, + "num_tokens": 327570307.0, + "step": 8585 + }, + { + "epoch": 
1.0922274519781199, + "grad_norm": 1.5644872188568115, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.876667857170105, + "num_tokens": 327607456.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "grad_norm": 1.5506566762924194, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8808712959289551, + "num_tokens": 327643523.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "grad_norm": 1.465980887413025, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8778280019760132, + "num_tokens": 327684407.0, + "step": 8588 + }, + { + "epoch": 1.0926090828138915, + "grad_norm": 1.5660079717636108, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8819847106933594, + "num_tokens": 327721322.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "grad_norm": 1.5770894289016724, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8735392093658447, + "num_tokens": 327755816.0, + "step": 8590 + }, + { + "epoch": 1.0928635033710723, + "grad_norm": 1.516373872756958, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8707212209701538, + "num_tokens": 327792552.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "grad_norm": 1.4953593015670776, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8781110048294067, + "num_tokens": 327828666.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "grad_norm": 1.5791239738464355, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8663685917854309, + "num_tokens": 327868424.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "grad_norm": 1.5924029350280762, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8798599243164062, + "num_tokens": 327905218.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "grad_norm": 1.5277949571609497, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 
0.8879647850990295, + "num_tokens": 327943397.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "grad_norm": 1.5060440301895142, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8766211271286011, + "num_tokens": 327983540.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "grad_norm": 1.5942755937576294, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8721169233322144, + "num_tokens": 328017879.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "grad_norm": 1.6340632438659668, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8794184923171997, + "num_tokens": 328050273.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "grad_norm": 1.6113622188568115, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.879746675491333, + "num_tokens": 328085456.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "grad_norm": 1.5160075426101685, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8845157623291016, + "num_tokens": 328126818.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "grad_norm": 1.5891478061676025, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8705963492393494, + "num_tokens": 328163092.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "grad_norm": 1.408146858215332, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8925948739051819, + "num_tokens": 328200927.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "grad_norm": 1.5832029581069946, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8807517290115356, + "num_tokens": 328235774.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "grad_norm": 1.503157615661621, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8779508471488953, + "num_tokens": 328280268.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "grad_norm": 1.631900429725647, 
+ "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.875415027141571, + "num_tokens": 328313667.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "grad_norm": 1.556494116783142, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8781880140304565, + "num_tokens": 328350758.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "grad_norm": 1.611028790473938, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8782120943069458, + "num_tokens": 328389487.0, + "step": 8607 + }, + { + "epoch": 1.095026078107111, + "grad_norm": 1.5892760753631592, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8603061437606812, + "num_tokens": 328428263.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "grad_norm": 1.6721291542053223, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8709123134613037, + "num_tokens": 328463131.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "grad_norm": 1.582937240600586, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8786494731903076, + "num_tokens": 328498149.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "grad_norm": 1.553248643875122, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8626143336296082, + "num_tokens": 328537838.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "grad_norm": 1.4732387065887451, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8780887722969055, + "num_tokens": 328576126.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "grad_norm": 1.620462417602539, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8570358753204346, + "num_tokens": 328612780.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "grad_norm": 1.5709450244903564, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8888090252876282, + "num_tokens": 328646093.0, + "step": 8614 + }, 
+ { + "epoch": 1.0959165500572445, + "grad_norm": 1.4595812559127808, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8851557374000549, + "num_tokens": 328684211.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "grad_norm": 1.4548956155776978, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8799264430999756, + "num_tokens": 328722012.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "grad_norm": 1.5373817682266235, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8938511610031128, + "num_tokens": 328754931.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "grad_norm": 1.4543402194976807, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8774160742759705, + "num_tokens": 328796194.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "grad_norm": 1.566734790802002, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8728443384170532, + "num_tokens": 328832073.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "grad_norm": 1.5359433889389038, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8828290104866028, + "num_tokens": 328865928.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "grad_norm": 1.5150386095046997, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8882324695587158, + "num_tokens": 328901596.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "grad_norm": 1.6282106637954712, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8729032874107361, + "num_tokens": 328938624.0, + "step": 8622 + }, + { + "epoch": 1.0969342322859688, + "grad_norm": 1.4748772382736206, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8866561055183411, + "num_tokens": 328974371.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "grad_norm": 1.561112880706787, + "learning_rate": 1e-06, + "loss": 0.3974, + 
"mean_token_accuracy": 0.8594805002212524, + "num_tokens": 329014333.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "grad_norm": 1.4449741840362549, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8870584964752197, + "num_tokens": 329058018.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "grad_norm": 1.5173333883285522, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8788550496101379, + "num_tokens": 329094204.0, + "step": 8626 + }, + { + "epoch": 1.0974430734003306, + "grad_norm": 1.6767596006393433, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8794589042663574, + "num_tokens": 329125638.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "grad_norm": 1.6593414545059204, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.851034939289093, + "num_tokens": 329163228.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "grad_norm": 1.5741806030273438, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8927640318870544, + "num_tokens": 329199088.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "grad_norm": 1.5218809843063354, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8713851571083069, + "num_tokens": 329240021.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "grad_norm": 1.5883846282958984, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8778399229049683, + "num_tokens": 329273407.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "grad_norm": 1.5155121088027954, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.881966233253479, + "num_tokens": 329312258.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "grad_norm": 1.473893165588379, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8656290173530579, + "num_tokens": 329354670.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + 
"grad_norm": 1.5832685232162476, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8770767450332642, + "num_tokens": 329389910.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "grad_norm": 1.6581109762191772, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8625877499580383, + "num_tokens": 329425112.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "grad_norm": 1.680361032485962, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8699249029159546, + "num_tokens": 329463202.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "grad_norm": 1.4725966453552246, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.888083279132843, + "num_tokens": 329501737.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "grad_norm": 1.4209641218185425, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8747831583023071, + "num_tokens": 329545854.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "grad_norm": 1.3616275787353516, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8627709150314331, + "num_tokens": 329594404.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "grad_norm": 1.361482858657837, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8931903839111328, + "num_tokens": 329636214.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "grad_norm": 1.437403678894043, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8766101002693176, + "num_tokens": 329677164.0, + "step": 8641 + }, + { + "epoch": 1.0993512275791884, + "grad_norm": 1.3808234930038452, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8854824900627136, + "num_tokens": 329717467.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "grad_norm": 1.5959621667861938, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8793761134147644, + 
"num_tokens": 329751542.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "grad_norm": 1.5021028518676758, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8836156725883484, + "num_tokens": 329788350.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "grad_norm": 1.5786112546920776, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8671137094497681, + "num_tokens": 329830760.0, + "step": 8645 + }, + { + "epoch": 1.0998600686935505, + "grad_norm": 1.665930151939392, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8848080635070801, + "num_tokens": 329864045.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "grad_norm": 1.531701683998108, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8551941514015198, + "num_tokens": 329903809.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "grad_norm": 1.4772688150405884, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8915817141532898, + "num_tokens": 329936749.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "grad_norm": 1.639701247215271, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8721947073936462, + "num_tokens": 329970810.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "grad_norm": 1.6505247354507446, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8743060827255249, + "num_tokens": 330007162.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "grad_norm": 1.5512502193450928, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8735195398330688, + "num_tokens": 330045015.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "grad_norm": 1.8496770858764648, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8834762573242188, + "num_tokens": 330077195.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "grad_norm": 1.547234058380127, + "learning_rate": 
1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8719322681427002, + "num_tokens": 330116453.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "grad_norm": 1.537020206451416, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8685861825942993, + "num_tokens": 330154920.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "grad_norm": 1.5500727891921997, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8632541298866272, + "num_tokens": 330197525.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "grad_norm": 1.569176435470581, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8740619421005249, + "num_tokens": 330233537.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "grad_norm": 1.4006294012069702, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8764126896858215, + "num_tokens": 330277521.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "grad_norm": 1.6101528406143188, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8702457547187805, + "num_tokens": 330316752.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "grad_norm": 1.5590723752975464, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8788639307022095, + "num_tokens": 330352219.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "grad_norm": 1.5676798820495605, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8748830556869507, + "num_tokens": 330388993.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "grad_norm": 1.495214819908142, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8700940012931824, + "num_tokens": 330431002.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "grad_norm": 1.6358331441879272, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8767291307449341, + "num_tokens": 330466500.0, + "step": 8662 + }, + { + "epoch": 
1.102022643429589, + "grad_norm": 1.4430670738220215, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8759380578994751, + "num_tokens": 330504852.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "grad_norm": 1.6782448291778564, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8636147975921631, + "num_tokens": 330540281.0, + "step": 8664 + }, + { + "epoch": 1.10227706398677, + "grad_norm": 1.5446982383728027, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8647105097770691, + "num_tokens": 330581022.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "grad_norm": 1.5067379474639893, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8871927857398987, + "num_tokens": 330617047.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "grad_norm": 1.6884812116622925, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.871346652507782, + "num_tokens": 330650552.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "grad_norm": 1.6206706762313843, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.889396071434021, + "num_tokens": 330684547.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "grad_norm": 1.3960649967193604, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8834389448165894, + "num_tokens": 330728757.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "grad_norm": 1.6450828313827515, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8958628177642822, + "num_tokens": 330765082.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "grad_norm": 1.550835132598877, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8694599866867065, + "num_tokens": 330804055.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "grad_norm": 1.5428502559661865, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 
0.8724279403686523, + "num_tokens": 330840882.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "grad_norm": 1.4759421348571777, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8809611201286316, + "num_tokens": 330880451.0, + "step": 8673 + }, + { + "epoch": 1.1034219564940848, + "grad_norm": 1.521437644958496, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.872444748878479, + "num_tokens": 330917435.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "grad_norm": 1.6471294164657593, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8814831376075745, + "num_tokens": 330957411.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "grad_norm": 1.6605421304702759, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8763049840927124, + "num_tokens": 330988039.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "grad_norm": 1.5939618349075317, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.872127115726471, + "num_tokens": 331025170.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "grad_norm": 1.4640213251113892, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.874585747718811, + "num_tokens": 331068132.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "grad_norm": 1.4687376022338867, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.875109076499939, + "num_tokens": 331105540.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "grad_norm": 1.679985761642456, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.875356912612915, + "num_tokens": 331139545.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "grad_norm": 1.6029869318008423, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8613694906234741, + "num_tokens": 331176610.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "grad_norm": 
1.7251170873641968, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8815699815750122, + "num_tokens": 331213005.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "grad_norm": 1.6283507347106934, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8683038949966431, + "num_tokens": 331255284.0, + "step": 8683 + }, + { + "epoch": 1.1046940592799899, + "grad_norm": 1.5417203903198242, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.868369460105896, + "num_tokens": 331292596.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "grad_norm": 1.4950209856033325, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8716699481010437, + "num_tokens": 331331773.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "grad_norm": 1.6652289628982544, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.878231406211853, + "num_tokens": 331366541.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "grad_norm": 1.3853167295455933, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8837695121765137, + "num_tokens": 331408084.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "grad_norm": 1.5139193534851074, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8730744123458862, + "num_tokens": 331444805.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "grad_norm": 1.5159424543380737, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8782486915588379, + "num_tokens": 331480999.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "grad_norm": 1.5955698490142822, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8608490228652954, + "num_tokens": 331516185.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "grad_norm": 1.5431551933288574, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8744504451751709, + "num_tokens": 
331553420.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "grad_norm": 1.6299885511398315, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8801355957984924, + "num_tokens": 331585151.0, + "step": 8692 + }, + { + "epoch": 1.1058389517873044, + "grad_norm": 1.408445954322815, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8724560737609863, + "num_tokens": 331628742.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "grad_norm": 1.535204529762268, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8728134632110596, + "num_tokens": 331666660.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "grad_norm": 1.4310914278030396, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8808910846710205, + "num_tokens": 331709759.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "grad_norm": 1.530213713645935, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8830097913742065, + "num_tokens": 331743947.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "grad_norm": 1.5297813415527344, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8812190294265747, + "num_tokens": 331780379.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "grad_norm": 1.5524555444717407, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8729863166809082, + "num_tokens": 331823250.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "grad_norm": 1.5869418382644653, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8900766372680664, + "num_tokens": 331858047.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "grad_norm": 1.605852484703064, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8610086441040039, + "num_tokens": 331897236.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "grad_norm": 1.5651906728744507, + "learning_rate": 1e-06, + 
"loss": 0.4035, + "mean_token_accuracy": 0.8599506616592407, + "num_tokens": 331936293.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "grad_norm": 1.4380155801773071, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8824250102043152, + "num_tokens": 331976356.0, + "step": 8702 + }, + { + "epoch": 1.1071110545732095, + "grad_norm": 1.4008129835128784, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8863036632537842, + "num_tokens": 332016875.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "grad_norm": 1.7853423357009888, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8778654336929321, + "num_tokens": 332051863.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "grad_norm": 1.5731749534606934, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8777884244918823, + "num_tokens": 332088431.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "grad_norm": 1.5329525470733643, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8804793357849121, + "num_tokens": 332124861.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "grad_norm": 1.4271188974380493, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8828864693641663, + "num_tokens": 332165629.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "grad_norm": 1.768234133720398, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8673105239868164, + "num_tokens": 332196761.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "grad_norm": 1.513157606124878, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8882045745849609, + "num_tokens": 332232964.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "grad_norm": 1.6540400981903076, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8822003602981567, + "num_tokens": 332266410.0, + "step": 8710 + }, + { + "epoch": 
1.1081287368019337, + "grad_norm": 1.6597654819488525, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8951267004013062, + "num_tokens": 332301881.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "grad_norm": 1.4466172456741333, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8754212856292725, + "num_tokens": 332343281.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "grad_norm": 1.4541271924972534, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8848885893821716, + "num_tokens": 332380584.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "grad_norm": 1.5741121768951416, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8745099902153015, + "num_tokens": 332417558.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "grad_norm": 1.5567268133163452, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8788599967956543, + "num_tokens": 332453057.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "grad_norm": 1.4105533361434937, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8809510469436646, + "num_tokens": 332499230.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "grad_norm": 1.5784496068954468, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8805925846099854, + "num_tokens": 332534378.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "grad_norm": 1.44203519821167, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8809612989425659, + "num_tokens": 332571796.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "grad_norm": 1.5227313041687012, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8661927580833435, + "num_tokens": 332609677.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "grad_norm": 1.5216666460037231, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 
0.8782715797424316, + "num_tokens": 332643682.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "grad_norm": 1.5907824039459229, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8806331157684326, + "num_tokens": 332681241.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "grad_norm": 1.463301658630371, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8762458562850952, + "num_tokens": 332719906.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "grad_norm": 1.446842908859253, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8863574862480164, + "num_tokens": 332759654.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "grad_norm": 1.4364116191864014, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8855651617050171, + "num_tokens": 332803099.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "grad_norm": 1.6773673295974731, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8845435976982117, + "num_tokens": 332833892.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "grad_norm": 1.4228755235671997, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8695083856582642, + "num_tokens": 332876963.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "grad_norm": 1.5529707670211792, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8586055636405945, + "num_tokens": 332917007.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "grad_norm": 1.655644416809082, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8672112226486206, + "num_tokens": 332952445.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "grad_norm": 1.65617835521698, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8674858212471008, + "num_tokens": 332988758.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "grad_norm": 
1.6775085926055908, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8557779788970947, + "num_tokens": 333025044.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "grad_norm": 1.651624321937561, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.865009069442749, + "num_tokens": 333065836.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "grad_norm": 1.6945314407348633, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8588838577270508, + "num_tokens": 333099316.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "grad_norm": 1.5747917890548706, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8775370717048645, + "num_tokens": 333137506.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "grad_norm": 1.5034816265106201, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8772037625312805, + "num_tokens": 333179171.0, + "step": 8734 + }, + { + "epoch": 1.111181783488106, + "grad_norm": 1.5831196308135986, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8838539123535156, + "num_tokens": 333215116.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "grad_norm": 1.4020037651062012, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8678091764450073, + "num_tokens": 333260716.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "grad_norm": 1.5840309858322144, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8756555318832397, + "num_tokens": 333297923.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "grad_norm": 1.571953535079956, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8810533881187439, + "num_tokens": 333338239.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "grad_norm": 1.622728943824768, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8891129493713379, + "num_tokens": 
333366885.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "grad_norm": 1.4814525842666626, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8736361265182495, + "num_tokens": 333408040.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "grad_norm": 1.6736302375793457, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8879058361053467, + "num_tokens": 333440950.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "grad_norm": 1.4368095397949219, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8851961493492126, + "num_tokens": 333482307.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "grad_norm": 1.70734703540802, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8544378280639648, + "num_tokens": 333516312.0, + "step": 8743 + }, + { + "epoch": 1.1123266759954205, + "grad_norm": 1.5653854608535767, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8712746500968933, + "num_tokens": 333550783.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "grad_norm": 1.394493818283081, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8787490725517273, + "num_tokens": 333593466.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "grad_norm": 1.6199928522109985, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8709287047386169, + "num_tokens": 333630663.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "grad_norm": 1.6391797065734863, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8796842098236084, + "num_tokens": 333661225.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "grad_norm": 1.461006760597229, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8784655928611755, + "num_tokens": 333697654.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "grad_norm": 1.4494510889053345, + "learning_rate": 1e-06, + 
"loss": 0.3823, + "mean_token_accuracy": 0.8647086024284363, + "num_tokens": 333739959.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "grad_norm": 1.575762152671814, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8787661790847778, + "num_tokens": 333776603.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "grad_norm": 1.537832260131836, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8720308542251587, + "num_tokens": 333815175.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "grad_norm": 1.571750283241272, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8774216175079346, + "num_tokens": 333855016.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "grad_norm": 1.4448318481445312, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8812577724456787, + "num_tokens": 333894051.0, + "step": 8753 + }, + { + "epoch": 1.1135987787813255, + "grad_norm": 1.5992664098739624, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8831958770751953, + "num_tokens": 333928164.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "grad_norm": 1.4739521741867065, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8681679368019104, + "num_tokens": 333969966.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "grad_norm": 1.5676568746566772, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8648908138275146, + "num_tokens": 334008949.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "grad_norm": 1.4481202363967896, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8773930668830872, + "num_tokens": 334052662.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "grad_norm": 1.506432294845581, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8843421339988708, + "num_tokens": 334087107.0, + "step": 8758 + }, + { + "epoch": 
1.1142348301742782, + "grad_norm": 1.5452282428741455, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8814030885696411, + "num_tokens": 334120428.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "grad_norm": 1.5089354515075684, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8813145160675049, + "num_tokens": 334159818.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "grad_norm": 1.4733201265335083, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8693685531616211, + "num_tokens": 334200115.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "grad_norm": 1.6401487588882446, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8689658641815186, + "num_tokens": 334239193.0, + "step": 8762 + }, + { + "epoch": 1.11474367128864, + "grad_norm": 1.6353658437728882, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.856310248374939, + "num_tokens": 334277587.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "grad_norm": 1.5099120140075684, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8742937445640564, + "num_tokens": 334314247.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "grad_norm": 1.7899200916290283, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8688563108444214, + "num_tokens": 334349605.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "grad_norm": 1.6811094284057617, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8738744258880615, + "num_tokens": 334380462.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "grad_norm": 1.5540492534637451, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8647741675376892, + "num_tokens": 334416317.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "grad_norm": 1.369728684425354, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 
0.876900851726532, + "num_tokens": 334459185.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "grad_norm": 1.5123034715652466, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8744196891784668, + "num_tokens": 334497616.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "grad_norm": 1.520203709602356, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8694053292274475, + "num_tokens": 334534796.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "grad_norm": 1.5832180976867676, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8705018758773804, + "num_tokens": 334570204.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "grad_norm": 1.539350986480713, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8573266267776489, + "num_tokens": 334611966.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "grad_norm": 1.4349297285079956, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8849724531173706, + "num_tokens": 334654483.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "grad_norm": 1.5842201709747314, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8831709027290344, + "num_tokens": 334687521.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "grad_norm": 1.5417490005493164, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8722741603851318, + "num_tokens": 334723256.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "grad_norm": 1.4420931339263916, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8835161924362183, + "num_tokens": 334762355.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "grad_norm": 1.608023762702942, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8739463090896606, + "num_tokens": 334799245.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "grad_norm": 
1.8143556118011475, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8765863180160522, + "num_tokens": 334830953.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "grad_norm": 1.6141210794448853, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8737545013427734, + "num_tokens": 334863772.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "grad_norm": 1.439728856086731, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8683594465255737, + "num_tokens": 334907194.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "grad_norm": 1.651915192604065, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8655587434768677, + "num_tokens": 334943637.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, + "grad_norm": 1.6362032890319824, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8710334300994873, + "num_tokens": 334982213.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "grad_norm": 1.6410152912139893, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8841038942337036, + "num_tokens": 335019636.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "grad_norm": 1.4492286443710327, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 335059904.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "grad_norm": 1.4875991344451904, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8869532346725464, + "num_tokens": 335095554.0, + "step": 8785 + }, + { + "epoch": 1.1176695076962218, + "grad_norm": 1.5022858381271362, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8724973201751709, + "num_tokens": 335138469.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "grad_norm": 1.5374600887298584, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8786284923553467, + "num_tokens": 
335176790.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "grad_norm": 1.432013750076294, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8869961500167847, + "num_tokens": 335216766.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "grad_norm": 1.5230637788772583, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.883766233921051, + "num_tokens": 335254631.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "grad_norm": 1.4659463167190552, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.87969970703125, + "num_tokens": 335295763.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "grad_norm": 1.5998259782791138, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8703218698501587, + "num_tokens": 335331327.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "grad_norm": 1.4503002166748047, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8855484127998352, + "num_tokens": 335366430.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "grad_norm": 1.53936767578125, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8797646164894104, + "num_tokens": 335402633.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "grad_norm": 1.4491031169891357, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8698588013648987, + "num_tokens": 335443495.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "grad_norm": 1.5764985084533691, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.885732114315033, + "num_tokens": 335477558.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "grad_norm": 1.593862533569336, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8713507652282715, + "num_tokens": 335511282.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "grad_norm": 1.6288185119628906, + "learning_rate": 1e-06, + "loss": 
0.3982, + "mean_token_accuracy": 0.8610565662384033, + "num_tokens": 335546617.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "grad_norm": 1.492740511894226, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8774304986000061, + "num_tokens": 335587102.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "grad_norm": 1.472307562828064, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8783751726150513, + "num_tokens": 335626680.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "grad_norm": 1.6249401569366455, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8641008734703064, + "num_tokens": 335661528.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "grad_norm": 1.598397135734558, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8663160800933838, + "num_tokens": 335696834.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "grad_norm": 1.5646610260009766, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8714369535446167, + "num_tokens": 335735245.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "grad_norm": 1.6358585357666016, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8666460514068604, + "num_tokens": 335773355.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "grad_norm": 1.5277221202850342, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8711292743682861, + "num_tokens": 335811667.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, + "grad_norm": 1.5175659656524658, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8502931594848633, + "num_tokens": 335856233.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "grad_norm": 1.543298363685608, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.883087158203125, + "num_tokens": 335891500.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + 
"grad_norm": 1.4984742403030396, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8776552677154541, + "num_tokens": 335928204.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "grad_norm": 1.4633657932281494, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8635891675949097, + "num_tokens": 335972065.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "grad_norm": 1.5665345191955566, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8739171028137207, + "num_tokens": 336009299.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "grad_norm": 1.49187171459198, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8832369446754456, + "num_tokens": 336047305.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "grad_norm": 1.5186270475387573, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8723234534263611, + "num_tokens": 336085754.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "grad_norm": 1.4219110012054443, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8790193796157837, + "num_tokens": 336127001.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "grad_norm": 1.6212093830108643, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8719591498374939, + "num_tokens": 336159851.0, + "step": 8813 + }, + { + "epoch": 1.121231395496756, + "grad_norm": 1.4567739963531494, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8656358122825623, + "num_tokens": 336202584.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "grad_norm": 1.3935918807983398, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8841668367385864, + "num_tokens": 336244233.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "grad_norm": 1.5195746421813965, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.878951907157898, + 
"num_tokens": 336279355.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "grad_norm": 1.5462157726287842, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8631844520568848, + "num_tokens": 336317525.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "grad_norm": 1.4481662511825562, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.886286199092865, + "num_tokens": 336353217.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "grad_norm": 1.4722018241882324, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8784854412078857, + "num_tokens": 336392708.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "grad_norm": 1.4895663261413574, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8715823888778687, + "num_tokens": 336431739.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "grad_norm": 1.4685871601104736, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8782291412353516, + "num_tokens": 336471003.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "grad_norm": 1.5191563367843628, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8586190342903137, + "num_tokens": 336512781.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "grad_norm": 1.3428521156311035, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8811403512954712, + "num_tokens": 336556132.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "grad_norm": 1.4694913625717163, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.864217221736908, + "num_tokens": 336597987.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "grad_norm": 1.4689526557922363, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8834598064422607, + "num_tokens": 336639389.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "grad_norm": 1.6330655813217163, + 
"learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.873103141784668, + "num_tokens": 336673366.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "grad_norm": 1.465225100517273, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8946696519851685, + "num_tokens": 336711614.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "grad_norm": 1.5808314085006714, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8782081604003906, + "num_tokens": 336749536.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "grad_norm": 1.722200632095337, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8661508560180664, + "num_tokens": 336780989.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "grad_norm": 1.5914266109466553, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8781878352165222, + "num_tokens": 336818437.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "grad_norm": 1.427582859992981, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8797585964202881, + "num_tokens": 336861923.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "grad_norm": 1.5415151119232178, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8802985548973083, + "num_tokens": 336898766.0, + "step": 8832 + }, + { + "epoch": 1.123648390789976, + "grad_norm": 1.645715355873108, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8855229616165161, + "num_tokens": 336929787.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "grad_norm": 1.6369214057922363, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.882804811000824, + "num_tokens": 336962677.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "grad_norm": 1.535250186920166, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8884626030921936, + "num_tokens": 336994346.0, + "step": 8835 + }, + 
{ + "epoch": 1.1240300216257473, + "grad_norm": 1.4535648822784424, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8627279996871948, + "num_tokens": 337037311.0, + "step": 8836 + }, + { + "epoch": 1.1241572319043378, + "grad_norm": 1.4929267168045044, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.870624303817749, + "num_tokens": 337075964.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "grad_norm": 1.5221936702728271, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8814796209335327, + "num_tokens": 337111687.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "grad_norm": 1.4690155982971191, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.864914059638977, + "num_tokens": 337151034.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "grad_norm": 1.6331357955932617, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8573993444442749, + "num_tokens": 337185981.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "grad_norm": 1.4925537109375, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8905366659164429, + "num_tokens": 337222496.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "grad_norm": 1.5535959005355835, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8762624263763428, + "num_tokens": 337261082.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "grad_norm": 1.5232248306274414, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8834376335144043, + "num_tokens": 337295623.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "grad_norm": 1.534911036491394, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8762202262878418, + "num_tokens": 337332212.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "grad_norm": 1.551002025604248, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 
0.8768951892852783, + "num_tokens": 337368539.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "grad_norm": 1.4273604154586792, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8662832975387573, + "num_tokens": 337415307.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "grad_norm": 1.4721343517303467, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8860102891921997, + "num_tokens": 337455390.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "grad_norm": 1.5031495094299316, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8794313669204712, + "num_tokens": 337492554.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "grad_norm": 1.9046684503555298, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8587968349456787, + "num_tokens": 337526425.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "grad_norm": 1.6283379793167114, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8727326989173889, + "num_tokens": 337562001.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "grad_norm": 1.521602988243103, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8741142749786377, + "num_tokens": 337601613.0, + "step": 8851 + }, + { + "epoch": 1.1260653860831955, + "grad_norm": 1.4806840419769287, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8820984959602356, + "num_tokens": 337641875.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "grad_norm": 1.5528976917266846, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.864970326423645, + "num_tokens": 337679406.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "grad_norm": 1.6647580862045288, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8634423017501831, + "num_tokens": 337714082.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "grad_norm": 
1.54352605342865, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8727492094039917, + "num_tokens": 337749446.0, + "step": 8855 + }, + { + "epoch": 1.1265742271975576, + "grad_norm": 1.498815655708313, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8689998388290405, + "num_tokens": 337790841.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "grad_norm": 1.4896650314331055, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8751047253608704, + "num_tokens": 337828992.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "grad_norm": 1.7727832794189453, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8787380456924438, + "num_tokens": 337856651.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "grad_norm": 1.7326120138168335, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8781431913375854, + "num_tokens": 337890475.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "grad_norm": 1.5278695821762085, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8771054148674011, + "num_tokens": 337925911.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "grad_norm": 1.496010422706604, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8791347742080688, + "num_tokens": 337964156.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "grad_norm": 1.4921578168869019, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8767098188400269, + "num_tokens": 338003066.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "grad_norm": 1.6204651594161987, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8578910827636719, + "num_tokens": 338040909.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "grad_norm": 1.5502127408981323, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8661871552467346, + "num_tokens": 338083145.0, 
+ "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "grad_norm": 1.3994165658950806, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8918513655662537, + "num_tokens": 338127624.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "grad_norm": 1.5084829330444336, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8665608167648315, + "num_tokens": 338165562.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "grad_norm": 1.5028218030929565, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.874836266040802, + "num_tokens": 338205906.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "grad_norm": 1.5043368339538574, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.88381427526474, + "num_tokens": 338248020.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "grad_norm": 1.5698704719543457, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8616093397140503, + "num_tokens": 338291702.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "grad_norm": 1.5800809860229492, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8869115114212036, + "num_tokens": 338325951.0, + "step": 8870 + }, + { + "epoch": 1.1284823813764153, + "grad_norm": 1.5223480463027954, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8679270148277283, + "num_tokens": 338366206.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "grad_norm": 1.4296036958694458, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8750938177108765, + "num_tokens": 338408022.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "grad_norm": 1.5315783023834229, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.878545880317688, + "num_tokens": 338443892.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "grad_norm": 1.539832353591919, + "learning_rate": 1e-06, + "loss": 0.4236, 
+ "mean_token_accuracy": 0.8557292222976685, + "num_tokens": 338486445.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "grad_norm": 1.5025155544281006, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8722659349441528, + "num_tokens": 338521236.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "grad_norm": 1.55752432346344, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.878359317779541, + "num_tokens": 338556976.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "grad_norm": 1.4692810773849487, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8754075169563293, + "num_tokens": 338596443.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "grad_norm": 1.4891841411590576, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8841378688812256, + "num_tokens": 338633748.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "grad_norm": 1.5632176399230957, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.88253253698349, + "num_tokens": 338672345.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "grad_norm": 1.4673166275024414, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8870953917503357, + "num_tokens": 338707279.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "grad_norm": 1.5854301452636719, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8811243176460266, + "num_tokens": 338739666.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "grad_norm": 1.5589045286178589, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8727996349334717, + "num_tokens": 338775714.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "grad_norm": 1.4654221534729004, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8619391918182373, + "num_tokens": 338819734.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + 
"grad_norm": 1.5034209489822388, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8820549249649048, + "num_tokens": 338859068.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "grad_norm": 1.4846417903900146, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8698266744613647, + "num_tokens": 338899211.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "grad_norm": 1.7245370149612427, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8560526371002197, + "num_tokens": 338935364.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "grad_norm": 1.6209453344345093, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8616153597831726, + "num_tokens": 338969303.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "grad_norm": 1.5135464668273926, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8740124702453613, + "num_tokens": 339009480.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "grad_norm": 1.6072849035263062, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8670682907104492, + "num_tokens": 339046336.0, + "step": 8889 + }, + { + "epoch": 1.130899376669635, + "grad_norm": 1.5130767822265625, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8666533827781677, + "num_tokens": 339085302.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "grad_norm": 1.4540461301803589, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8796123266220093, + "num_tokens": 339123635.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "grad_norm": 1.536494255065918, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8747304677963257, + "num_tokens": 339161874.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "grad_norm": 1.5259300470352173, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8875039219856262, + 
"num_tokens": 339193974.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "grad_norm": 1.5700889825820923, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.867313027381897, + "num_tokens": 339232472.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "grad_norm": 1.554712176322937, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8760862350463867, + "num_tokens": 339273242.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "grad_norm": 1.4744534492492676, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8775556683540344, + "num_tokens": 339314108.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "grad_norm": 1.5181207656860352, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8658841848373413, + "num_tokens": 339354652.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "grad_norm": 1.5448073148727417, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8709774017333984, + "num_tokens": 339391811.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "grad_norm": 1.3647581338882446, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8738641142845154, + "num_tokens": 339438226.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "grad_norm": 1.4248815774917603, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8803491592407227, + "num_tokens": 339480547.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "grad_norm": 1.523484230041504, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8601322174072266, + "num_tokens": 339519479.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "grad_norm": 1.599610686302185, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8716704845428467, + "num_tokens": 339553336.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "grad_norm": 1.4606871604919434, + "learning_rate": 
1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8861426115036011, + "num_tokens": 339593657.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "grad_norm": 1.5236977338790894, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8711147308349609, + "num_tokens": 339634201.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "grad_norm": 1.5736637115478516, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8749038577079773, + "num_tokens": 339672711.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "grad_norm": 1.5217939615249634, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8741400241851807, + "num_tokens": 339708283.0, + "step": 8906 + }, + { + "epoch": 1.1330619514056737, + "grad_norm": 1.4659438133239746, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8889889717102051, + "num_tokens": 339743541.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "grad_norm": 1.5339787006378174, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8693946599960327, + "num_tokens": 339781348.0, + "step": 8908 + }, + { + "epoch": 1.1333163719628545, + "grad_norm": 1.610504150390625, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8742064237594604, + "num_tokens": 339814250.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "grad_norm": 1.452128291130066, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8767678737640381, + "num_tokens": 339857720.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "grad_norm": 1.4141523838043213, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8806527853012085, + "num_tokens": 339898664.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "grad_norm": 1.4444299936294556, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.887184202671051, + "num_tokens": 339937281.0, + "step": 8912 + }, + { + "epoch": 
1.1338252130772166, + "grad_norm": 1.3851525783538818, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8872672319412231, + "num_tokens": 339977100.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "grad_norm": 1.5790538787841797, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8633096814155579, + "num_tokens": 340014933.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "grad_norm": 1.861431360244751, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8591740727424622, + "num_tokens": 340048954.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "grad_norm": 1.4989908933639526, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.868056058883667, + "num_tokens": 340089084.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "grad_norm": 1.4478647708892822, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8577510118484497, + "num_tokens": 340134177.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "grad_norm": 1.6264872550964355, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8694620132446289, + "num_tokens": 340170856.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "grad_norm": 1.4579871892929077, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8734338283538818, + "num_tokens": 340212593.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "grad_norm": 1.533206582069397, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8794773817062378, + "num_tokens": 340249791.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "grad_norm": 1.534467339515686, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8612924814224243, + "num_tokens": 340288874.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "grad_norm": 1.3177576065063477, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 
0.8993387222290039, + "num_tokens": 340330384.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "grad_norm": 1.5510362386703491, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8712689876556396, + "num_tokens": 340368586.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "grad_norm": 1.6091244220733643, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.874911904335022, + "num_tokens": 340403033.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "grad_norm": 1.5733699798583984, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8891922831535339, + "num_tokens": 340437611.0, + "step": 8925 + }, + { + "epoch": 1.1354789466988933, + "grad_norm": 1.4123927354812622, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8810526132583618, + "num_tokens": 340479917.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "grad_norm": 1.3720656633377075, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8834540247917175, + "num_tokens": 340521803.0, + "step": 8927 + }, + { + "epoch": 1.1357333672560743, + "grad_norm": 1.4896999597549438, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8838964104652405, + "num_tokens": 340557246.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "grad_norm": 1.6014294624328613, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.869266152381897, + "num_tokens": 340594139.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "grad_norm": 1.4149506092071533, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8780248165130615, + "num_tokens": 340639372.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "grad_norm": 1.596011757850647, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8834551572799683, + "num_tokens": 340676990.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "grad_norm": 
1.4712365865707397, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8693498373031616, + "num_tokens": 340721864.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "grad_norm": 1.5024123191833496, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8778604865074158, + "num_tokens": 340759841.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "grad_norm": 1.5924675464630127, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.880849301815033, + "num_tokens": 340796909.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "grad_norm": 1.4250344038009644, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8879212141036987, + "num_tokens": 340837816.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "grad_norm": 1.5038294792175293, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8715232610702515, + "num_tokens": 340876338.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "grad_norm": 1.5518670082092285, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8684926629066467, + "num_tokens": 340913151.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "grad_norm": 1.5007867813110352, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8822852373123169, + "num_tokens": 340950829.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "grad_norm": 1.400496006011963, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8741505146026611, + "num_tokens": 340996026.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "grad_norm": 1.4951810836791992, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8757240176200867, + "num_tokens": 341034746.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "grad_norm": 1.624175786972046, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8764908313751221, + "num_tokens": 
341071138.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "grad_norm": 1.5995055437088013, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8669570684432983, + "num_tokens": 341106699.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "grad_norm": 1.4845836162567139, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.882919192314148, + "num_tokens": 341144093.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "grad_norm": 1.5835262537002563, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8675940036773682, + "num_tokens": 341182240.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + "grad_norm": 1.5696253776550293, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8687901496887207, + "num_tokens": 341217065.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "grad_norm": 1.5051244497299194, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8785890340805054, + "num_tokens": 341257284.0, + "step": 8946 + }, + { + "epoch": 1.138150362549294, + "grad_norm": 1.4552338123321533, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8798288106918335, + "num_tokens": 341294339.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "grad_norm": 1.5333576202392578, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8561164736747742, + "num_tokens": 341340484.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "grad_norm": 1.4172614812850952, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8873311281204224, + "num_tokens": 341383396.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "grad_norm": 1.5988866090774536, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8877672553062439, + "num_tokens": 341417517.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "grad_norm": 1.522204041481018, + "learning_rate": 1e-06, + 
"loss": 0.3686, + "mean_token_accuracy": 0.8713014721870422, + "num_tokens": 341456855.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "grad_norm": 1.454338788986206, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.879461407661438, + "num_tokens": 341497720.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "grad_norm": 1.6257175207138062, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8540216684341431, + "num_tokens": 341537712.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "grad_norm": 1.6357442140579224, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8737409114837646, + "num_tokens": 341571600.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "grad_norm": 1.3946162462234497, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8753890991210938, + "num_tokens": 341617100.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "grad_norm": 1.5010358095169067, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8852607011795044, + "num_tokens": 341653247.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "grad_norm": 1.6611554622650146, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8708144426345825, + "num_tokens": 341687734.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "grad_norm": 1.8460547924041748, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8786698579788208, + "num_tokens": 341718920.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "grad_norm": 1.4144161939620972, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8656112551689148, + "num_tokens": 341763571.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "grad_norm": 1.640904188156128, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8697613477706909, + "num_tokens": 341797074.0, + "step": 8960 + }, + { + "epoch": 
1.139931306449561, + "grad_norm": 1.5441269874572754, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8753249645233154, + "num_tokens": 341833819.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "grad_norm": 1.6363461017608643, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8781196475028992, + "num_tokens": 341864432.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "grad_norm": 1.476045846939087, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8748570680618286, + "num_tokens": 341905334.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "grad_norm": 1.54122793674469, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8645920753479004, + "num_tokens": 341947280.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "grad_norm": 1.5935014486312866, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.880056619644165, + "num_tokens": 341982794.0, + "step": 8965 + }, + { + "epoch": 1.1405673578425137, + "grad_norm": 1.5637142658233643, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8726478815078735, + "num_tokens": 342023909.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "grad_norm": 1.6389570236206055, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8643983602523804, + "num_tokens": 342060984.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "grad_norm": 1.4442386627197266, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8778116703033447, + "num_tokens": 342100186.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "grad_norm": 1.546465277671814, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8614283800125122, + "num_tokens": 342141844.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "grad_norm": 1.5321104526519775, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 
0.869202733039856, + "num_tokens": 342180811.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "grad_norm": 1.570658564567566, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8865955471992493, + "num_tokens": 342213342.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "grad_norm": 1.508281946182251, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8970913290977478, + "num_tokens": 342249655.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "grad_norm": 1.7289751768112183, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8679032921791077, + "num_tokens": 342283179.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "grad_norm": 1.4748554229736328, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8781919479370117, + "num_tokens": 342322675.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "grad_norm": 1.4259870052337646, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8735790848731995, + "num_tokens": 342363405.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "grad_norm": 1.4765698909759521, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8775978088378906, + "num_tokens": 342406139.0, + "step": 8976 + }, + { + "epoch": 1.1419666709070093, + "grad_norm": 1.534095048904419, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8847914934158325, + "num_tokens": 342444696.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "grad_norm": 1.540123462677002, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8770974278450012, + "num_tokens": 342479128.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "grad_norm": 1.802822232246399, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.868603527545929, + "num_tokens": 342512374.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "grad_norm": 
1.6048634052276611, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.883051872253418, + "num_tokens": 342546319.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "grad_norm": 1.5077974796295166, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8837253451347351, + "num_tokens": 342585755.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "grad_norm": 1.4357843399047852, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8862409591674805, + "num_tokens": 342626113.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "grad_norm": 1.4766085147857666, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8700153827667236, + "num_tokens": 342664543.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 1.487781286239624, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8747813701629639, + "num_tokens": 342704969.0, + "step": 8984 + }, + { + "epoch": 1.1429843531357333, + "grad_norm": 1.6375718116760254, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8777633905410767, + "num_tokens": 342739486.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "grad_norm": 1.4361318349838257, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8685444593429565, + "num_tokens": 342781310.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "grad_norm": 1.5878992080688477, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.875592052936554, + "num_tokens": 342815873.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "grad_norm": 1.4916951656341553, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8882681727409363, + "num_tokens": 342851021.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "grad_norm": 1.6329553127288818, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8661156296730042, + "num_tokens": 
342890374.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "grad_norm": 1.5638799667358398, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8695230484008789, + "num_tokens": 342927845.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "grad_norm": 1.595686912536621, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8556437492370605, + "num_tokens": 342967852.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "grad_norm": 1.4609400033950806, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.882480800151825, + "num_tokens": 343007481.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "grad_norm": 1.5305068492889404, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8889926671981812, + "num_tokens": 343042000.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "grad_norm": 1.4043570756912231, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8838512301445007, + "num_tokens": 343084934.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "grad_norm": 1.4002209901809692, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8790664076805115, + "num_tokens": 343127332.0, + "step": 8995 + }, + { + "epoch": 1.144383666200229, + "grad_norm": 1.3778002262115479, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.879011869430542, + "num_tokens": 343171836.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "grad_norm": 1.5484296083450317, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8625845909118652, + "num_tokens": 343210376.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "grad_norm": 1.4012318849563599, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8890621662139893, + "num_tokens": 343251637.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "grad_norm": 1.4728277921676636, + "learning_rate": 1e-06, + "loss": 
0.3349, + "mean_token_accuracy": 0.8809389472007751, + "num_tokens": 343290632.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "grad_norm": 1.4233170747756958, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8923869132995605, + "num_tokens": 343331239.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "grad_norm": 1.3781671524047852, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8920083045959473, + "num_tokens": 343368845.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "grad_norm": 1.4823073148727417, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8841761350631714, + "num_tokens": 343405806.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "grad_norm": 1.5417572259902954, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8716650009155273, + "num_tokens": 343444224.0, + "step": 9003 + }, + { + "epoch": 1.1454013484289531, + "grad_norm": 1.4963364601135254, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8708559274673462, + "num_tokens": 343486614.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "grad_norm": 1.5016168355941772, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8830083608627319, + "num_tokens": 343526580.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "grad_norm": 1.5221179723739624, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8592928647994995, + "num_tokens": 343566560.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "grad_norm": 1.5755515098571777, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8803601264953613, + "num_tokens": 343601978.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "grad_norm": 1.6116968393325806, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8646301627159119, + "num_tokens": 343641845.0, + "step": 9008 + }, + { + "epoch": 
1.1460373998219056, + "grad_norm": 1.6284384727478027, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8751856684684753, + "num_tokens": 343677755.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "grad_norm": 1.5460489988327026, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8646255731582642, + "num_tokens": 343715221.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "grad_norm": 1.427384853363037, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8597844839096069, + "num_tokens": 343760875.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "grad_norm": 1.4596134424209595, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8805593252182007, + "num_tokens": 343800828.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "grad_norm": 1.6053361892700195, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8609976768493652, + "num_tokens": 343835824.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "grad_norm": 1.5994255542755127, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8519445657730103, + "num_tokens": 343879484.0, + "step": 9014 + }, + { + "epoch": 1.1468006614934487, + "grad_norm": 1.4353930950164795, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8679195642471313, + "num_tokens": 343923811.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "grad_norm": 1.6487799882888794, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.850017786026001, + "num_tokens": 343961618.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "grad_norm": 1.4567668437957764, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.881842315196991, + "num_tokens": 344002871.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "grad_norm": 1.4094995260238647, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 
0.8785915374755859, + "num_tokens": 344044895.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "grad_norm": 1.5428929328918457, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8745440244674683, + "num_tokens": 344079129.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "grad_norm": 1.4690533876419067, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8604357242584229, + "num_tokens": 344124361.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "grad_norm": 1.5005104541778564, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8908820152282715, + "num_tokens": 344158061.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "grad_norm": 1.485215663909912, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8816536664962769, + "num_tokens": 344197170.0, + "step": 9022 + }, + { + "epoch": 1.1478183437221727, + "grad_norm": 1.4799755811691284, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8652482628822327, + "num_tokens": 344237565.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "grad_norm": 1.4174641370773315, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8842467665672302, + "num_tokens": 344282594.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "grad_norm": 1.5719635486602783, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8789503574371338, + "num_tokens": 344318246.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "grad_norm": 1.3451600074768066, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.894875168800354, + "num_tokens": 344359874.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "grad_norm": 1.413031816482544, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8880370855331421, + "num_tokens": 344398595.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "grad_norm": 
1.5825557708740234, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8687722682952881, + "num_tokens": 344440375.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "grad_norm": 1.501014232635498, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8810387849807739, + "num_tokens": 344478134.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "grad_norm": 1.638091802597046, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8601174354553223, + "num_tokens": 344512340.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "grad_norm": 1.6337494850158691, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.876056432723999, + "num_tokens": 344548486.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "grad_norm": 1.4606314897537231, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8730870485305786, + "num_tokens": 344588139.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "grad_norm": 1.7553114891052246, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8676021695137024, + "num_tokens": 344621510.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "grad_norm": 1.587256669998169, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8715759515762329, + "num_tokens": 344657886.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "grad_norm": 1.539338231086731, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8738281726837158, + "num_tokens": 344695931.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "grad_norm": 1.616716742515564, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.87959885597229, + "num_tokens": 344733742.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "grad_norm": 1.4989978075027466, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8602659106254578, + "num_tokens": 344777457.0, 
+ "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "grad_norm": 1.469001293182373, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8760911226272583, + "num_tokens": 344814628.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "grad_norm": 1.5055088996887207, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.870977520942688, + "num_tokens": 344857946.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "grad_norm": 1.5624898672103882, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8697643280029297, + "num_tokens": 344894565.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "grad_norm": 1.4795671701431274, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8823274374008179, + "num_tokens": 344935817.0, + "step": 9041 + }, + { + "epoch": 1.1502353390153925, + "grad_norm": 1.5068851709365845, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8732537031173706, + "num_tokens": 344974923.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "grad_norm": 1.596782922744751, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8635954856872559, + "num_tokens": 345013574.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "grad_norm": 1.4170037508010864, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.870994508266449, + "num_tokens": 345056815.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "grad_norm": 1.7779937982559204, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8651413917541504, + "num_tokens": 345095500.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "grad_norm": 1.5716098546981812, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8764001727104187, + "num_tokens": 345131663.0, + "step": 9046 + }, + { + "epoch": 1.150871390408345, + "grad_norm": 1.566177487373352, + "learning_rate": 1e-06, + "loss": 0.3659, + 
"mean_token_accuracy": 0.8739328980445862, + "num_tokens": 345167157.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "grad_norm": 1.4839212894439697, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.876831591129303, + "num_tokens": 345203630.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "grad_norm": 1.5034525394439697, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8592168092727661, + "num_tokens": 345248245.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "grad_norm": 1.5512542724609375, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8694769144058228, + "num_tokens": 345286322.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "grad_norm": 1.5301730632781982, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8717085123062134, + "num_tokens": 345323609.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "grad_norm": 1.5257377624511719, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8813138604164124, + "num_tokens": 345358973.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "grad_norm": 1.423252820968628, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8692983388900757, + "num_tokens": 345398676.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "grad_norm": 1.4757963418960571, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8884285092353821, + "num_tokens": 345436066.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "grad_norm": 1.5062830448150635, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8730079531669617, + "num_tokens": 345472115.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "grad_norm": 1.4335534572601318, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8856750726699829, + "num_tokens": 345512861.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + 
"grad_norm": 1.6885885000228882, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8576575517654419, + "num_tokens": 345551000.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "grad_norm": 1.45802640914917, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8679393529891968, + "num_tokens": 345591567.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "grad_norm": 1.5092101097106934, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8665366172790527, + "num_tokens": 345634190.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "grad_norm": 1.5276331901550293, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.861296534538269, + "num_tokens": 345675253.0, + "step": 9060 + }, + { + "epoch": 1.1526523343086121, + "grad_norm": 1.4376620054244995, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8720183968544006, + "num_tokens": 345716664.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "grad_norm": 1.5938441753387451, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8650107979774475, + "num_tokens": 345752929.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "grad_norm": 1.5212458372116089, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8727145791053772, + "num_tokens": 345791001.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "grad_norm": 1.5670559406280518, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.880718469619751, + "num_tokens": 345824498.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "grad_norm": 1.5938220024108887, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8713200092315674, + "num_tokens": 345864282.0, + "step": 9065 + }, + { + "epoch": 1.1532883857015648, + "grad_norm": 1.5758416652679443, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8789855241775513, + 
"num_tokens": 345897625.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "grad_norm": 1.459101915359497, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8908088207244873, + "num_tokens": 345936238.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "grad_norm": 1.6543004512786865, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8721276521682739, + "num_tokens": 345970638.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "grad_norm": 1.8015393018722534, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8671359419822693, + "num_tokens": 346010709.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "grad_norm": 1.35426664352417, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8819348812103271, + "num_tokens": 346055482.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "grad_norm": 1.5919697284698486, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.873361349105835, + "num_tokens": 346095623.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "grad_norm": 1.5205425024032593, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8661048412322998, + "num_tokens": 346135748.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "grad_norm": 1.6020805835723877, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8733183145523071, + "num_tokens": 346170428.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "grad_norm": 1.5554171800613403, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8660867810249329, + "num_tokens": 346209382.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "grad_norm": 1.492291808128357, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8603941798210144, + "num_tokens": 346250459.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "grad_norm": 1.4965312480926514, + "learning_rate": 
1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8709539175033569, + "num_tokens": 346294028.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "grad_norm": 1.4548909664154053, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8740375638008118, + "num_tokens": 346331905.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "grad_norm": 1.457753300666809, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8880266547203064, + "num_tokens": 346370064.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "grad_norm": 1.485083818435669, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8952598571777344, + "num_tokens": 346404629.0, + "step": 9079 + }, + { + "epoch": 1.1550693296018317, + "grad_norm": 1.6525285243988037, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8676340579986572, + "num_tokens": 346439552.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "grad_norm": 1.580918312072754, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8552998900413513, + "num_tokens": 346476456.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "grad_norm": 1.5048391819000244, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8607569336891174, + "num_tokens": 346517672.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "grad_norm": 1.58682119846344, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8668078184127808, + "num_tokens": 346556586.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "grad_norm": 1.5180860757827759, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8820815086364746, + "num_tokens": 346598356.0, + "step": 9084 + }, + { + "epoch": 1.1557053809947844, + "grad_norm": 1.505062222480774, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8829975128173828, + "num_tokens": 346635402.0, + "step": 9085 + }, + { + "epoch": 
1.155832591273375, + "grad_norm": 1.5141230821609497, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8815567493438721, + "num_tokens": 346673896.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "grad_norm": 1.628670573234558, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8776736855506897, + "num_tokens": 346706966.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "grad_norm": 1.4307578802108765, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8739809989929199, + "num_tokens": 346749228.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "grad_norm": 1.6202646493911743, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8612362742424011, + "num_tokens": 346788499.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "grad_norm": 1.456817388534546, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.874535083770752, + "num_tokens": 346827546.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "grad_norm": 1.494858741760254, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8765150308609009, + "num_tokens": 346865932.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "grad_norm": 1.6440256834030151, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8729729652404785, + "num_tokens": 346899910.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "grad_norm": 1.5646641254425049, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.869289755821228, + "num_tokens": 346935176.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "grad_norm": 1.5456647872924805, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8819507360458374, + "num_tokens": 346970327.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "grad_norm": 1.5197749137878418, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 
0.8868564367294312, + "num_tokens": 347006727.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "grad_norm": 1.3861024379730225, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8797155618667603, + "num_tokens": 347049894.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "grad_norm": 1.472003698348999, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8745393753051758, + "num_tokens": 347093028.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "grad_norm": 1.4563205242156982, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8850814700126648, + "num_tokens": 347133850.0, + "step": 9098 + }, + { + "epoch": 1.1574863248950515, + "grad_norm": 1.6545292139053345, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8787711262702942, + "num_tokens": 347166684.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "grad_norm": 1.515095829963684, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8672119379043579, + "num_tokens": 347206909.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "grad_norm": 1.4794272184371948, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.873999834060669, + "num_tokens": 347247514.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "grad_norm": 1.7095729112625122, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8594732284545898, + "num_tokens": 347281418.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "grad_norm": 1.5376986265182495, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8610564470291138, + "num_tokens": 347322188.0, + "step": 9103 + }, + { + "epoch": 1.158122376288004, + "grad_norm": 1.529273271560669, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8873385190963745, + "num_tokens": 347355845.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "grad_norm": 
1.6100605726242065, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8561649322509766, + "num_tokens": 347394666.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "grad_norm": 1.561704397201538, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8706262111663818, + "num_tokens": 347432125.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "grad_norm": 1.4730302095413208, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8637561798095703, + "num_tokens": 347474716.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "grad_norm": 1.457024097442627, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8788335919380188, + "num_tokens": 347516709.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "grad_norm": 1.4535361528396606, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8762876391410828, + "num_tokens": 347554934.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "grad_norm": 1.5256264209747314, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8674433827400208, + "num_tokens": 347590546.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "grad_norm": 1.471194863319397, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8750635385513306, + "num_tokens": 347630569.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "grad_norm": 1.4965240955352783, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.871466338634491, + "num_tokens": 347670627.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "grad_norm": 1.507369875907898, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8912139534950256, + "num_tokens": 347704956.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "grad_norm": 1.451232671737671, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8781633377075195, + "num_tokens": 
347745870.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "grad_norm": 1.4772682189941406, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8805942535400391, + "num_tokens": 347783571.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "grad_norm": 1.4345695972442627, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.861726701259613, + "num_tokens": 347829294.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "grad_norm": 1.5207558870315552, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8644320964813232, + "num_tokens": 347869631.0, + "step": 9117 + }, + { + "epoch": 1.1599033201882711, + "grad_norm": 1.3484965562820435, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8904633522033691, + "num_tokens": 347915919.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "grad_norm": 1.4658232927322388, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8595924377441406, + "num_tokens": 347960341.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "grad_norm": 1.5317164659500122, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8652245998382568, + "num_tokens": 347998747.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "grad_norm": 1.5177751779556274, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8645162582397461, + "num_tokens": 348037084.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "grad_norm": 1.4194151163101196, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8808000087738037, + "num_tokens": 348077145.0, + "step": 9122 + }, + { + "epoch": 1.1605393715812238, + "grad_norm": 1.5364587306976318, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8657281398773193, + "num_tokens": 348116568.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "grad_norm": 1.5496571063995361, + "learning_rate": 1e-06, 
+ "loss": 0.4044, + "mean_token_accuracy": 0.8625034689903259, + "num_tokens": 348154796.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "grad_norm": 1.4716393947601318, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8683058023452759, + "num_tokens": 348195823.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "grad_norm": 1.4742546081542969, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8890156745910645, + "num_tokens": 348231112.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "grad_norm": 1.468672752380371, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8839808702468872, + "num_tokens": 348272711.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "grad_norm": 1.549788236618042, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8736679553985596, + "num_tokens": 348308048.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "grad_norm": 1.5760499238967896, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8713150024414062, + "num_tokens": 348344315.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "grad_norm": 1.3982752561569214, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8703997731208801, + "num_tokens": 348388693.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "grad_norm": 1.4915194511413574, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8592323660850525, + "num_tokens": 348432943.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "grad_norm": 1.579296588897705, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8872026801109314, + "num_tokens": 348468264.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "grad_norm": 1.5280466079711914, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8861936330795288, + "num_tokens": 348503534.0, + "step": 9133 + }, + { + "epoch": 
1.1619386846457194, + "grad_norm": 1.5683043003082275, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8889123201370239, + "num_tokens": 348537555.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "grad_norm": 1.6527988910675049, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8619927167892456, + "num_tokens": 348573579.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "grad_norm": 1.5839289426803589, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8910812139511108, + "num_tokens": 348606353.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "grad_norm": 1.552451252937317, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8793407678604126, + "num_tokens": 348647170.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "grad_norm": 1.5711610317230225, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8866300582885742, + "num_tokens": 348685669.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "grad_norm": 1.3724067211151123, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8910802602767944, + "num_tokens": 348727998.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "grad_norm": 1.4278846979141235, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8824265599250793, + "num_tokens": 348768113.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "grad_norm": 1.461798071861267, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8762252330780029, + "num_tokens": 348808204.0, + "step": 9141 + }, + { + "epoch": 1.1629563668744434, + "grad_norm": 1.5032387971878052, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8809712529182434, + "num_tokens": 348844379.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "grad_norm": 1.6258330345153809, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 
0.8786816596984863, + "num_tokens": 348879003.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "grad_norm": 1.5125616788864136, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8744626045227051, + "num_tokens": 348917696.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "grad_norm": 1.5222231149673462, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8801814317703247, + "num_tokens": 348954706.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "grad_norm": 1.4350860118865967, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8697664737701416, + "num_tokens": 348997335.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "grad_norm": 1.5633230209350586, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8604822158813477, + "num_tokens": 349035972.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "grad_norm": 1.4106768369674683, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8745294809341431, + "num_tokens": 349078724.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "grad_norm": 1.4158506393432617, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8709284067153931, + "num_tokens": 349121838.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "grad_norm": 1.5634695291519165, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8755012154579163, + "num_tokens": 349158457.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "grad_norm": 1.5021816492080688, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8870776891708374, + "num_tokens": 349192124.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "grad_norm": 1.5618151426315308, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8842629194259644, + "num_tokens": 349229129.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "grad_norm": 
1.545653223991394, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8706636428833008, + "num_tokens": 349269980.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "grad_norm": 1.5592756271362305, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8812884092330933, + "num_tokens": 349306266.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "grad_norm": 1.6413445472717285, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8782188892364502, + "num_tokens": 349337877.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "grad_norm": 1.4459329843521118, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8951307535171509, + "num_tokens": 349373062.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "grad_norm": 1.4437013864517212, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8898137211799622, + "num_tokens": 349410589.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "grad_norm": 1.6545398235321045, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.864370584487915, + "num_tokens": 349447169.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "grad_norm": 1.626167893409729, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8770944476127625, + "num_tokens": 349478281.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "grad_norm": 1.7063852548599243, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.859784722328186, + "num_tokens": 349514293.0, + "step": 9160 + }, + { + "epoch": 1.1653733621676632, + "grad_norm": 1.5585485696792603, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8869494795799255, + "num_tokens": 349546654.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "grad_norm": 1.7330241203308105, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8797450065612793, + "num_tokens": 
349580409.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "grad_norm": 1.4767279624938965, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8740541934967041, + "num_tokens": 349618292.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "grad_norm": 1.6633297204971313, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8639023900032043, + "num_tokens": 349655250.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "grad_norm": 1.541205644607544, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8720672726631165, + "num_tokens": 349694994.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "grad_norm": 1.488982081413269, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.89642333984375, + "num_tokens": 349733551.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "grad_norm": 1.5349807739257812, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8871017694473267, + "num_tokens": 349768422.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "grad_norm": 1.6100982427597046, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8535231351852417, + "num_tokens": 349805423.0, + "step": 9168 + }, + { + "epoch": 1.1663910443963872, + "grad_norm": 1.425745964050293, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8702661991119385, + "num_tokens": 349848590.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "grad_norm": 1.5001364946365356, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8836857080459595, + "num_tokens": 349889191.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "grad_norm": 1.3852190971374512, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.884937047958374, + "num_tokens": 349929850.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "grad_norm": 1.7587864398956299, + "learning_rate": 1e-06, + 
"loss": 0.3259, + "mean_token_accuracy": 0.8834457397460938, + "num_tokens": 349961656.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "grad_norm": 1.5830986499786377, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8720048666000366, + "num_tokens": 350001589.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "grad_norm": 1.6786681413650513, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8629550933837891, + "num_tokens": 350034271.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "grad_norm": 1.5284943580627441, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8835066556930542, + "num_tokens": 350073418.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "grad_norm": 1.3983213901519775, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8798131942749023, + "num_tokens": 350114147.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "grad_norm": 1.4482152462005615, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8881497979164124, + "num_tokens": 350152507.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "grad_norm": 1.5406087636947632, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.884353756904602, + "num_tokens": 350186591.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "grad_norm": 1.5529241561889648, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8620514869689941, + "num_tokens": 350222959.0, + "step": 9179 + }, + { + "epoch": 1.1677903574608828, + "grad_norm": 1.3970997333526611, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8885194063186646, + "num_tokens": 350263968.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "grad_norm": 1.5613327026367188, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8571219444274902, + "num_tokens": 350303301.0, + "step": 9181 + }, + { + "epoch": 
1.1680447780180638, + "grad_norm": 1.668578863143921, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8707546591758728, + "num_tokens": 350339157.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "grad_norm": 1.54710853099823, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8702642321586609, + "num_tokens": 350376033.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "grad_norm": 1.593273401260376, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.860777735710144, + "num_tokens": 350410851.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "grad_norm": 1.488153100013733, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8750943541526794, + "num_tokens": 350449235.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "grad_norm": 1.5872228145599365, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8690704107284546, + "num_tokens": 350486764.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "grad_norm": 1.5067228078842163, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.880312979221344, + "num_tokens": 350521152.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, + "grad_norm": 1.460018277168274, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8705600500106812, + "num_tokens": 350561507.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "grad_norm": 1.5530239343643188, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8693620562553406, + "num_tokens": 350600881.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "grad_norm": 1.5744352340698242, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8618624210357666, + "num_tokens": 350636674.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "grad_norm": 1.496343731880188, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 
0.8677675724029541, + "num_tokens": 350677875.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "grad_norm": 1.5659432411193848, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8821536898612976, + "num_tokens": 350712915.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "grad_norm": 1.5955004692077637, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8572352528572083, + "num_tokens": 350751099.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "grad_norm": 1.5971418619155884, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8769670724868774, + "num_tokens": 350785668.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "grad_norm": 1.4673925638198853, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8924386501312256, + "num_tokens": 350822189.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "grad_norm": 1.4410789012908936, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8784739375114441, + "num_tokens": 350861803.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "grad_norm": 1.4342408180236816, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8769807815551758, + "num_tokens": 350904583.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "grad_norm": 1.536668300628662, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8638792037963867, + "num_tokens": 350944411.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "grad_norm": 1.5134034156799316, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8818334341049194, + "num_tokens": 350982469.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "grad_norm": 1.4627715349197388, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8854192495346069, + "num_tokens": 351021733.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "grad_norm": 
1.3687045574188232, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8747022747993469, + "num_tokens": 351067909.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "grad_norm": 1.4955646991729736, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8884998559951782, + "num_tokens": 351106454.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "grad_norm": 1.3582401275634766, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8891383409500122, + "num_tokens": 351145238.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "grad_norm": 1.49832284450531, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8740559220314026, + "num_tokens": 351183718.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "grad_norm": 1.577612280845642, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8636789321899414, + "num_tokens": 351218188.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "grad_norm": 1.3984954357147217, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8697447776794434, + "num_tokens": 351262627.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "grad_norm": 1.3998767137527466, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8823307752609253, + "num_tokens": 351305786.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "grad_norm": 1.6943186521530151, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8680311441421509, + "num_tokens": 351339846.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "grad_norm": 1.4836039543151855, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8792175650596619, + "num_tokens": 351377181.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "grad_norm": 1.4143191576004028, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8661876916885376, + "num_tokens": 
351423235.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "grad_norm": 1.5091732740402222, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8773607015609741, + "num_tokens": 351460740.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "grad_norm": 1.6229491233825684, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8840988874435425, + "num_tokens": 351494259.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "grad_norm": 1.7674288749694824, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8795623183250427, + "num_tokens": 351523142.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "grad_norm": 1.53864586353302, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8633354902267456, + "num_tokens": 351561707.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "grad_norm": 1.5002726316452026, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8697379231452942, + "num_tokens": 351601924.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "grad_norm": 1.558853268623352, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8694050312042236, + "num_tokens": 351639456.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "grad_norm": 1.4790724515914917, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.882346510887146, + "num_tokens": 351679283.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "grad_norm": 1.5013543367385864, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8709350228309631, + "num_tokens": 351719005.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "grad_norm": 1.4813753366470337, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.875329852104187, + "num_tokens": 351755712.0, + "step": 9219 + }, + { + "epoch": 1.1728787686045032, + "grad_norm": 1.4985945224761963, + "learning_rate": 1e-06, + 
"loss": 0.371, + "mean_token_accuracy": 0.8721170425415039, + "num_tokens": 351796754.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "grad_norm": 1.5108624696731567, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8841070532798767, + "num_tokens": 351831781.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "grad_norm": 1.3074902296066284, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8901911973953247, + "num_tokens": 351875709.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "grad_norm": 1.5338220596313477, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8892838954925537, + "num_tokens": 351911494.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "grad_norm": 1.5790371894836426, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8642226457595825, + "num_tokens": 351946645.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "grad_norm": 1.662564754486084, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8709136843681335, + "num_tokens": 351982154.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "grad_norm": 1.6891765594482422, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8789459466934204, + "num_tokens": 352013743.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "grad_norm": 1.4603800773620605, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8722914457321167, + "num_tokens": 352058606.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "grad_norm": 1.6713799238204956, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8816797733306885, + "num_tokens": 352093044.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "grad_norm": 1.5141408443450928, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.867104172706604, + "num_tokens": 352131103.0, + "step": 9229 + }, + { + "epoch": 
1.1741508713904083, + "grad_norm": 1.521239995956421, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.871527910232544, + "num_tokens": 352165974.0, + "step": 9230 + }, + { + "epoch": 1.1742780816689988, + "grad_norm": 1.608397126197815, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8615427017211914, + "num_tokens": 352203631.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "grad_norm": 1.5260522365570068, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8767012357711792, + "num_tokens": 352243563.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "grad_norm": 1.6253422498703003, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.867840051651001, + "num_tokens": 352276892.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "grad_norm": 1.540748953819275, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8814654350280762, + "num_tokens": 352315235.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "grad_norm": 1.6101256608963013, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8761079907417297, + "num_tokens": 352348534.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "grad_norm": 1.486952304840088, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8714123964309692, + "num_tokens": 352389400.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "grad_norm": 1.5907137393951416, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8673217296600342, + "num_tokens": 352426957.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "grad_norm": 1.4139033555984497, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8894250988960266, + "num_tokens": 352465264.0, + "step": 9238 + }, + { + "epoch": 1.175295763897723, + "grad_norm": 1.39711594581604, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 
0.8729994297027588, + "num_tokens": 352513548.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "grad_norm": 1.6238231658935547, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8614670038223267, + "num_tokens": 352549553.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "grad_norm": 1.4309649467468262, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8720136284828186, + "num_tokens": 352589461.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "grad_norm": 1.3992301225662231, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8862781524658203, + "num_tokens": 352630861.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "grad_norm": 1.4595078229904175, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8869011402130127, + "num_tokens": 352666525.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "grad_norm": 1.7713613510131836, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8824695944786072, + "num_tokens": 352701423.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "grad_norm": 1.5457793474197388, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8743917942047119, + "num_tokens": 352739927.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "grad_norm": 1.6500132083892822, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8473174571990967, + "num_tokens": 352779406.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "grad_norm": 1.6063690185546875, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8671446442604065, + "num_tokens": 352819229.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "grad_norm": 1.432407021522522, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8868869543075562, + "num_tokens": 352857057.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "grad_norm": 
1.5023339986801147, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8688397407531738, + "num_tokens": 352898301.0, + "step": 9249 + }, + { + "epoch": 1.1766950769622184, + "grad_norm": 1.5363185405731201, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8671088218688965, + "num_tokens": 352935699.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "grad_norm": 1.551042914390564, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8860085010528564, + "num_tokens": 352969486.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "grad_norm": 1.5955275297164917, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8714239597320557, + "num_tokens": 353004211.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "grad_norm": 1.469531774520874, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8850288391113281, + "num_tokens": 353045325.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "grad_norm": 1.4795809984207153, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8757968544960022, + "num_tokens": 353085843.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "grad_norm": 1.6380615234375, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8680220246315002, + "num_tokens": 353123562.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "grad_norm": 1.7520486116409302, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8723640441894531, + "num_tokens": 353158328.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "grad_norm": 1.3467563390731812, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8891265392303467, + "num_tokens": 353201328.0, + "step": 9257 + }, + { + "epoch": 1.1777127591909426, + "grad_norm": 1.6115087270736694, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8859924077987671, + "num_tokens": 353234148.0, 
+ "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "grad_norm": 1.485191822052002, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8835379481315613, + "num_tokens": 353273104.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "grad_norm": 1.5182111263275146, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8690131902694702, + "num_tokens": 353314759.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "grad_norm": 1.4502933025360107, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8772116899490356, + "num_tokens": 353358682.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "grad_norm": 1.6000021696090698, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8676155805587769, + "num_tokens": 353394871.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "grad_norm": 1.4027146100997925, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8891037702560425, + "num_tokens": 353434769.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "grad_norm": 1.596200704574585, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8644574880599976, + "num_tokens": 353473034.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "grad_norm": 1.2758113145828247, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8949528932571411, + "num_tokens": 353517900.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "grad_norm": 1.542988657951355, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8847863078117371, + "num_tokens": 353551836.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "grad_norm": 1.5178723335266113, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8745309114456177, + "num_tokens": 353588441.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "grad_norm": 1.4315125942230225, + "learning_rate": 1e-06, + "loss": 0.3335, 
+ "mean_token_accuracy": 0.8814213275909424, + "num_tokens": 353627245.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "grad_norm": 1.3602077960968018, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8867143392562866, + "num_tokens": 353669585.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "grad_norm": 1.3934077024459839, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8775784373283386, + "num_tokens": 353712539.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "grad_norm": 1.4045486450195312, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.885167121887207, + "num_tokens": 353753552.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "grad_norm": 1.5594782829284668, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8669394254684448, + "num_tokens": 353792672.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "grad_norm": 1.5053941011428833, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8840829730033875, + "num_tokens": 353831757.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "grad_norm": 1.4828581809997559, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8906480073928833, + "num_tokens": 353870022.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "grad_norm": 1.5359232425689697, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8834053874015808, + "num_tokens": 353906854.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "grad_norm": 1.6546989679336548, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8814823627471924, + "num_tokens": 353937155.0, + "step": 9276 + }, + { + "epoch": 1.1801297544841622, + "grad_norm": 1.6081933975219727, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.866949737071991, + "num_tokens": 353974457.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, 
+ "grad_norm": 1.538074016571045, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8871221542358398, + "num_tokens": 354008499.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "grad_norm": 1.533789038658142, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.872477114200592, + "num_tokens": 354045922.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "grad_norm": 1.648596167564392, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8624259233474731, + "num_tokens": 354082532.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "grad_norm": 1.4753539562225342, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8728073239326477, + "num_tokens": 354124410.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "grad_norm": 1.5205374956130981, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8795100450515747, + "num_tokens": 354161346.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "grad_norm": 1.5402873754501343, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8807774782180786, + "num_tokens": 354201914.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "grad_norm": 1.551416277885437, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8709388971328735, + "num_tokens": 354237939.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "grad_norm": 1.5025670528411865, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8780121207237244, + "num_tokens": 354277031.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "grad_norm": 1.5294688940048218, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.868097186088562, + "num_tokens": 354312922.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "grad_norm": 1.486059308052063, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8650322556495667, + 
"num_tokens": 354353702.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "grad_norm": 1.6287546157836914, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8696058988571167, + "num_tokens": 354390075.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "grad_norm": 1.5360956192016602, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8788308501243591, + "num_tokens": 354430404.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "grad_norm": 1.4129788875579834, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.885372519493103, + "num_tokens": 354467826.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "grad_norm": 1.3381739854812622, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.886899471282959, + "num_tokens": 354512610.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "grad_norm": 1.602634310722351, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8884426355361938, + "num_tokens": 354548894.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "grad_norm": 1.5980168581008911, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8631960153579712, + "num_tokens": 354587672.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "grad_norm": 1.5939668416976929, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8633781671524048, + "num_tokens": 354626632.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "grad_norm": 1.391152262687683, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8827939629554749, + "num_tokens": 354666010.0, + "step": 9295 + }, + { + "epoch": 1.182546749777382, + "grad_norm": 1.5572718381881714, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8828192949295044, + "num_tokens": 354705802.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "grad_norm": 1.4554827213287354, + "learning_rate": 
1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8687366247177124, + "num_tokens": 354748833.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "grad_norm": 1.6276440620422363, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8655439615249634, + "num_tokens": 354784918.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "grad_norm": 1.7177140712738037, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8737022280693054, + "num_tokens": 354816700.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "grad_norm": 1.5436608791351318, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8533293008804321, + "num_tokens": 354859527.0, + "step": 9300 + }, + { + "epoch": 1.1831828011703345, + "grad_norm": 1.5729399919509888, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8765004277229309, + "num_tokens": 354897019.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "grad_norm": 1.4003227949142456, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8837810754776001, + "num_tokens": 354937141.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "grad_norm": 1.420034646987915, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8881887197494507, + "num_tokens": 354976114.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "grad_norm": 1.4160641431808472, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8801071643829346, + "num_tokens": 355016198.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "grad_norm": 1.3776170015335083, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8827090859413147, + "num_tokens": 355060596.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "grad_norm": 1.7363756895065308, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8671494722366333, + "num_tokens": 355091564.0, + "step": 9306 + }, + { + 
"epoch": 1.1839460628418776, + "grad_norm": 1.6084063053131104, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8625738024711609, + "num_tokens": 355129066.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "grad_norm": 1.6285057067871094, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8756458163261414, + "num_tokens": 355161202.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "grad_norm": 1.5366743803024292, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8740975260734558, + "num_tokens": 355197124.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "grad_norm": 1.48235023021698, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8845851421356201, + "num_tokens": 355234308.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "grad_norm": 1.5559732913970947, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8726020455360413, + "num_tokens": 355271024.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "grad_norm": 1.5808919668197632, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8846536874771118, + "num_tokens": 355307463.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "grad_norm": 1.4403157234191895, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8885483741760254, + "num_tokens": 355349246.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "grad_norm": 1.3622066974639893, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8823918104171753, + "num_tokens": 355395868.0, + "step": 9314 + }, + { + "epoch": 1.1849637450706016, + "grad_norm": 1.5169591903686523, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.870642900466919, + "num_tokens": 355435056.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "grad_norm": 1.465184211730957, + "learning_rate": 1e-06, + "loss": 0.3108, + 
"mean_token_accuracy": 0.8926688432693481, + "num_tokens": 355471821.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "grad_norm": 1.5913342237472534, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8701093196868896, + "num_tokens": 355507176.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "grad_norm": 1.4860118627548218, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8773685693740845, + "num_tokens": 355549828.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "grad_norm": 1.5452733039855957, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8777332305908203, + "num_tokens": 355586833.0, + "step": 9319 + }, + { + "epoch": 1.1855997964635543, + "grad_norm": 1.5006470680236816, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8709137439727783, + "num_tokens": 355629251.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "grad_norm": 1.585479497909546, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8706071376800537, + "num_tokens": 355669658.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "grad_norm": 1.7329243421554565, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8594847321510315, + "num_tokens": 355703666.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "grad_norm": 1.4300845861434937, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8890901803970337, + "num_tokens": 355741983.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "grad_norm": 1.7797050476074219, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8790349960327148, + "num_tokens": 355771029.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "grad_norm": 1.5919559001922607, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8754118084907532, + "num_tokens": 355809831.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + 
"grad_norm": 1.5333950519561768, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8594611287117004, + "num_tokens": 355852537.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "grad_norm": 1.3607577085494995, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8680012226104736, + "num_tokens": 355895171.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "grad_norm": 1.4166535139083862, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8696501851081848, + "num_tokens": 355938418.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "grad_norm": 1.5458588600158691, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8850782513618469, + "num_tokens": 355973727.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "grad_norm": 1.5043975114822388, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8747342228889465, + "num_tokens": 356010860.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "grad_norm": 1.4940721988677979, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.88213050365448, + "num_tokens": 356048013.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "grad_norm": 1.4377778768539429, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8787772059440613, + "num_tokens": 356090268.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "grad_norm": 1.5235810279846191, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.887326180934906, + "num_tokens": 356125470.0, + "step": 9333 + }, + { + "epoch": 1.1873807403638215, + "grad_norm": 1.50026273727417, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8739044070243835, + "num_tokens": 356168119.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "grad_norm": 1.4918636083602905, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8797672390937805, + 
"num_tokens": 356208382.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "grad_norm": 1.4735714197158813, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8769252300262451, + "num_tokens": 356251378.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "grad_norm": 1.5385901927947998, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8682268261909485, + "num_tokens": 356294325.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "grad_norm": 1.6664146184921265, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8669158220291138, + "num_tokens": 356331503.0, + "step": 9338 + }, + { + "epoch": 1.1880167917567739, + "grad_norm": 1.440178632736206, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8739280104637146, + "num_tokens": 356375654.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "grad_norm": 1.5388824939727783, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8850809335708618, + "num_tokens": 356409714.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "grad_norm": 1.38274347782135, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8858969807624817, + "num_tokens": 356454576.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "grad_norm": 1.5199921131134033, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8997359275817871, + "num_tokens": 356490637.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "grad_norm": 1.5533080101013184, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8712674975395203, + "num_tokens": 356527772.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "grad_norm": 1.5366392135620117, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8665682673454285, + "num_tokens": 356563791.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "grad_norm": 1.554705262184143, + "learning_rate": 
1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8675236701965332, + "num_tokens": 356600972.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "grad_norm": 1.4297406673431396, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8912714719772339, + "num_tokens": 356638855.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "grad_norm": 1.4648605585098267, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.87892746925354, + "num_tokens": 356675231.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "grad_norm": 1.5804071426391602, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8811830282211304, + "num_tokens": 356709573.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "grad_norm": 1.5690892934799194, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8718147277832031, + "num_tokens": 356741366.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "grad_norm": 1.5147902965545654, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8817696571350098, + "num_tokens": 356783665.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "grad_norm": 1.64415442943573, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8882904052734375, + "num_tokens": 356815930.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "grad_norm": 1.4605494737625122, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8667984008789062, + "num_tokens": 356858530.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "grad_norm": 1.5316561460494995, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8719179630279541, + "num_tokens": 356899247.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "grad_norm": 1.4231525659561157, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8736491203308105, + "num_tokens": 356939089.0, + "step": 9354 + }, + { + "epoch": 
1.1900521562142221, + "grad_norm": 1.7091654539108276, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8838005065917969, + "num_tokens": 356971665.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "grad_norm": 1.4466649293899536, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8854070901870728, + "num_tokens": 357010625.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "grad_norm": 1.5113550424575806, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8820673823356628, + "num_tokens": 357049437.0, + "step": 9357 + }, + { + "epoch": 1.1904337870499937, + "grad_norm": 1.5036066770553589, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8872648477554321, + "num_tokens": 357084595.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "grad_norm": 1.593661904335022, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8708237409591675, + "num_tokens": 357121435.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "grad_norm": 1.6111233234405518, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8880497217178345, + "num_tokens": 357156073.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "grad_norm": 1.3998664617538452, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.882007360458374, + "num_tokens": 357198376.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "grad_norm": 1.4715949296951294, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8775895833969116, + "num_tokens": 357235716.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "grad_norm": 1.5286434888839722, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8784006834030151, + "num_tokens": 357272482.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "grad_norm": 1.57130765914917, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 
0.8663073182106018, + "num_tokens": 357307159.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "grad_norm": 1.4533531665802002, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8740658164024353, + "num_tokens": 357348636.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "grad_norm": 1.6102720499038696, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8678374886512756, + "num_tokens": 357384488.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "grad_norm": 1.5191361904144287, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8894516825675964, + "num_tokens": 357420760.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "grad_norm": 1.4459574222564697, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8650615811347961, + "num_tokens": 357466579.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "grad_norm": 1.5696872472763062, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8564022779464722, + "num_tokens": 357507086.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "grad_norm": 1.5877865552902222, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8546037673950195, + "num_tokens": 357546159.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "grad_norm": 1.5231728553771973, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8752306699752808, + "num_tokens": 357582859.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "grad_norm": 1.4789711236953735, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8892741203308105, + "num_tokens": 357624083.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "grad_norm": 1.4365997314453125, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.881960391998291, + "num_tokens": 357663971.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "grad_norm": 
1.490235686302185, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8787686228752136, + "num_tokens": 357700809.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "grad_norm": 1.6162822246551514, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8791359663009644, + "num_tokens": 357733424.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "grad_norm": 1.5888856649398804, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8727407455444336, + "num_tokens": 357767030.0, + "step": 9376 + }, + { + "epoch": 1.1928507823432133, + "grad_norm": 1.5038771629333496, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8716224431991577, + "num_tokens": 357807284.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "grad_norm": 1.3973716497421265, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8755722641944885, + "num_tokens": 357850495.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "grad_norm": 1.5522280931472778, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8661007881164551, + "num_tokens": 357889317.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "grad_norm": 1.6659281253814697, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8738280534744263, + "num_tokens": 357922779.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "grad_norm": 1.3879982233047485, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8741447925567627, + "num_tokens": 357965083.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "grad_norm": 1.4335081577301025, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8960429430007935, + "num_tokens": 358002722.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "grad_norm": 1.5505588054656982, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.873631477355957, + "num_tokens": 
358039581.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "grad_norm": 1.6523984670639038, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8655948042869568, + "num_tokens": 358073216.0, + "step": 9384 + }, + { + "epoch": 1.1938684645719375, + "grad_norm": 1.4318268299102783, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8791972398757935, + "num_tokens": 358112777.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "grad_norm": 1.6495026350021362, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8656297922134399, + "num_tokens": 358146730.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "grad_norm": 1.554517388343811, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8829638957977295, + "num_tokens": 358184973.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "grad_norm": 1.6708873510360718, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8778952360153198, + "num_tokens": 358216904.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "grad_norm": 1.5062787532806396, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8694663643836975, + "num_tokens": 358257060.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "grad_norm": 1.4837124347686768, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.884233295917511, + "num_tokens": 358295461.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "grad_norm": 1.4485729932785034, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8787189722061157, + "num_tokens": 358335484.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "grad_norm": 1.4797475337982178, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.880944550037384, + "num_tokens": 358373871.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "grad_norm": 1.464734673500061, + "learning_rate": 1e-06, + 
"loss": 0.3786, + "mean_token_accuracy": 0.8674554228782654, + "num_tokens": 358415529.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "grad_norm": 1.5252861976623535, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8783361911773682, + "num_tokens": 358454446.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "grad_norm": 1.5044323205947876, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8698078393936157, + "num_tokens": 358494283.0, + "step": 9395 + }, + { + "epoch": 1.195267777636433, + "grad_norm": 1.5187292098999023, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8869049549102783, + "num_tokens": 358529556.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "grad_norm": 1.525091528892517, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8810333013534546, + "num_tokens": 358566575.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "grad_norm": 1.6399649381637573, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.879056990146637, + "num_tokens": 358600733.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "grad_norm": 1.4421895742416382, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8756675124168396, + "num_tokens": 358643004.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "grad_norm": 1.4323511123657227, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8723349571228027, + "num_tokens": 358685832.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "grad_norm": 1.5163363218307495, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8722573518753052, + "num_tokens": 358729686.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "grad_norm": 1.5192461013793945, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8831350803375244, + "num_tokens": 358767562.0, + "step": 9402 + }, + { + "epoch": 
1.1961582495865666, + "grad_norm": 1.6764914989471436, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8775291442871094, + "num_tokens": 358799961.0, + "step": 9403 + }, + { + "epoch": 1.196285459865157, + "grad_norm": 1.5175977945327759, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8726105690002441, + "num_tokens": 358839501.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "grad_norm": 1.4889191389083862, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8713802099227905, + "num_tokens": 358879671.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "grad_norm": 1.5841234922409058, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8852337598800659, + "num_tokens": 358914710.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "grad_norm": 1.6705825328826904, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.873683750629425, + "num_tokens": 358947493.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "grad_norm": 1.474677324295044, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8790408968925476, + "num_tokens": 358988139.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "grad_norm": 1.4862558841705322, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8843786120414734, + "num_tokens": 359025721.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "grad_norm": 1.386808156967163, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.867659330368042, + "num_tokens": 359070489.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "grad_norm": 1.42279851436615, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8905921578407288, + "num_tokens": 359110846.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "grad_norm": 1.3776965141296387, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 
0.883675217628479, + "num_tokens": 359154151.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "grad_norm": 1.520687222480774, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8907152414321899, + "num_tokens": 359188175.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "grad_norm": 1.5066231489181519, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8714790344238281, + "num_tokens": 359227532.0, + "step": 9414 + }, + { + "epoch": 1.1976847729296527, + "grad_norm": 1.6033681631088257, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8639088869094849, + "num_tokens": 359263011.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "grad_norm": 1.5094635486602783, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8786216974258423, + "num_tokens": 359301214.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "grad_norm": 1.5871915817260742, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8491170406341553, + "num_tokens": 359342338.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "grad_norm": 1.465536117553711, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8833670616149902, + "num_tokens": 359384031.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "grad_norm": 1.7407580614089966, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8744702935218811, + "num_tokens": 359415847.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "grad_norm": 1.4655396938323975, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8813031911849976, + "num_tokens": 359456183.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "grad_norm": 1.6728004217147827, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8860160112380981, + "num_tokens": 359487500.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "grad_norm": 
1.551835060119629, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.868237316608429, + "num_tokens": 359526212.0, + "step": 9422 + }, + { + "epoch": 1.1987024551583767, + "grad_norm": 1.67050302028656, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8619776964187622, + "num_tokens": 359560129.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "grad_norm": 1.4721001386642456, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8675351142883301, + "num_tokens": 359602538.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "grad_norm": 1.5880171060562134, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8807066679000854, + "num_tokens": 359636750.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "grad_norm": 1.423882007598877, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8691930770874023, + "num_tokens": 359676494.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "grad_norm": 1.4741681814193726, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8816053867340088, + "num_tokens": 359713078.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "grad_norm": 1.5386743545532227, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.870907187461853, + "num_tokens": 359749989.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "grad_norm": 1.5118176937103271, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8771063089370728, + "num_tokens": 359784029.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "grad_norm": 1.4536890983581543, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8727006912231445, + "num_tokens": 359824682.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "grad_norm": 1.5393973588943481, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8847575187683105, + "num_tokens": 
359865048.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "grad_norm": 1.4148457050323486, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8766569495201111, + "num_tokens": 359907581.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "grad_norm": 1.3844127655029297, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8768567442893982, + "num_tokens": 359952438.0, + "step": 9433 + }, + { + "epoch": 1.2001017682228725, + "grad_norm": 1.5645233392715454, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8736692070960999, + "num_tokens": 359990303.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "grad_norm": 1.5740063190460205, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8675657510757446, + "num_tokens": 360028578.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "grad_norm": 1.490604043006897, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8833326697349548, + "num_tokens": 360066196.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "grad_norm": 1.8933517932891846, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8767276406288147, + "num_tokens": 360098906.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "grad_norm": 1.563542366027832, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8832235336303711, + "num_tokens": 360133219.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "grad_norm": 1.5758318901062012, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8716089725494385, + "num_tokens": 360168656.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "grad_norm": 1.4945851564407349, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8755839467048645, + "num_tokens": 360206550.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "grad_norm": 1.5494271516799927, + "learning_rate": 1e-06, + 
"loss": 0.3641, + "mean_token_accuracy": 0.8701832294464111, + "num_tokens": 360243133.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "grad_norm": 1.4316661357879639, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8794546127319336, + "num_tokens": 360281830.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "grad_norm": 1.6271448135375977, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8701100945472717, + "num_tokens": 360321720.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "grad_norm": 1.4060951471328735, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8756693005561829, + "num_tokens": 360367760.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "grad_norm": 1.587836742401123, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8743383288383484, + "num_tokens": 360403035.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "grad_norm": 1.6063530445098877, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8731418251991272, + "num_tokens": 360438766.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "grad_norm": 1.6436575651168823, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8677421808242798, + "num_tokens": 360471656.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "grad_norm": 1.7387189865112305, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8512120246887207, + "num_tokens": 360504368.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "grad_norm": 1.4322587251663208, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.876474142074585, + "num_tokens": 360545734.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "grad_norm": 1.5399503707885742, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8763135075569153, + "num_tokens": 360578765.0, + "step": 9450 + }, + { + "epoch": 
1.202264342958911, + "grad_norm": 1.5381187200546265, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8724963665008545, + "num_tokens": 360618681.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "grad_norm": 1.4914779663085938, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.865588903427124, + "num_tokens": 360657540.0, + "step": 9452 + }, + { + "epoch": 1.202518763516092, + "grad_norm": 1.5340567827224731, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8713369369506836, + "num_tokens": 360696788.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "grad_norm": 1.5003533363342285, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.891453206539154, + "num_tokens": 360733688.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "grad_norm": 1.4603877067565918, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8824942111968994, + "num_tokens": 360767719.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "grad_norm": 1.6212846040725708, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.865638792514801, + "num_tokens": 360802893.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "grad_norm": 1.6019014120101929, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8642727732658386, + "num_tokens": 360838637.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "grad_norm": 1.5643987655639648, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8807065486907959, + "num_tokens": 360874477.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "grad_norm": 1.5359634160995483, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8768928050994873, + "num_tokens": 360909234.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "grad_norm": 1.4338420629501343, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 
0.886567234992981, + "num_tokens": 360948572.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "grad_norm": 1.5237538814544678, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8838422298431396, + "num_tokens": 360984033.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "grad_norm": 1.5361438989639282, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8645815849304199, + "num_tokens": 361021027.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "grad_norm": 1.4735363721847534, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8903981447219849, + "num_tokens": 361061801.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "grad_norm": 1.608420729637146, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8824553489685059, + "num_tokens": 361098698.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "grad_norm": 1.5820832252502441, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8744219541549683, + "num_tokens": 361136612.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "grad_norm": 1.4970080852508545, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8848960995674133, + "num_tokens": 361172359.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "grad_norm": 1.4391072988510132, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8989056348800659, + "num_tokens": 361210668.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "grad_norm": 1.5836509466171265, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8657466173171997, + "num_tokens": 361250661.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "grad_norm": 1.4643863439559937, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.877661406993866, + "num_tokens": 361289090.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "grad_norm": 
1.5043306350708008, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8694877028465271, + "num_tokens": 361326952.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "grad_norm": 1.5232768058776855, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8912085890769958, + "num_tokens": 361360226.0, + "step": 9471 + }, + { + "epoch": 1.2049357588093117, + "grad_norm": 1.3758924007415771, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8878393173217773, + "num_tokens": 361401986.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "grad_norm": 1.4758349657058716, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8820456862449646, + "num_tokens": 361442419.0, + "step": 9473 + }, + { + "epoch": 1.2051901793664928, + "grad_norm": 1.46152663230896, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8805294632911682, + "num_tokens": 361483591.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "grad_norm": 1.5135653018951416, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8808761835098267, + "num_tokens": 361518923.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "grad_norm": 1.5299386978149414, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8784276843070984, + "num_tokens": 361554797.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "grad_norm": 1.549126148223877, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8786852359771729, + "num_tokens": 361590652.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "grad_norm": 1.427656888961792, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8694144487380981, + "num_tokens": 361632995.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "grad_norm": 1.4459949731826782, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8900524377822876, + "num_tokens": 
361669594.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "grad_norm": 1.564936637878418, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8710439205169678, + "num_tokens": 361709651.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "grad_norm": 1.4859294891357422, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8633500337600708, + "num_tokens": 361750582.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "grad_norm": 1.4375412464141846, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8696271181106567, + "num_tokens": 361795217.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "grad_norm": 1.522308349609375, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8800714612007141, + "num_tokens": 361832152.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "grad_norm": 1.60673987865448, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8564605712890625, + "num_tokens": 361872762.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "grad_norm": 1.4115647077560425, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8857288360595703, + "num_tokens": 361916845.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "grad_norm": 1.6296674013137817, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8759016990661621, + "num_tokens": 361951853.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "grad_norm": 1.5098241567611694, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8692975044250488, + "num_tokens": 361992282.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "grad_norm": 1.7006845474243164, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8686869740486145, + "num_tokens": 362022824.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "grad_norm": 1.520060658454895, + "learning_rate": 1e-06, + "loss": 
0.3109, + "mean_token_accuracy": 0.8898450136184692, + "num_tokens": 362058186.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "grad_norm": 1.3572982549667358, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8879916667938232, + "num_tokens": 362099579.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, + "grad_norm": 1.7369333505630493, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8692106008529663, + "num_tokens": 362133174.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "grad_norm": 1.4771660566329956, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8836967945098877, + "num_tokens": 362172823.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, + "grad_norm": 1.6104450225830078, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8702911138534546, + "num_tokens": 362209754.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "grad_norm": 1.3913273811340332, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8811601400375366, + "num_tokens": 362253065.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "grad_norm": 1.5661537647247314, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.878362774848938, + "num_tokens": 362291067.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "grad_norm": 1.5837290287017822, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8663057088851929, + "num_tokens": 362323233.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "grad_norm": 1.602656602859497, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.856756329536438, + "num_tokens": 362365025.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "grad_norm": 1.7460967302322388, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8823559880256653, + "num_tokens": 362396484.0, + "step": 9498 + }, + { + "epoch": 
1.2083704363312555, + "grad_norm": 1.3702892065048218, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8731628656387329, + "num_tokens": 362440484.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "grad_norm": 1.4729464054107666, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8754635453224182, + "num_tokens": 362480525.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "grad_norm": 1.4844311475753784, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8715878129005432, + "num_tokens": 362518231.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "grad_norm": 1.5351186990737915, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8823642730712891, + "num_tokens": 362555805.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "grad_norm": 1.5406042337417603, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8510392904281616, + "num_tokens": 362597057.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "grad_norm": 1.5875990390777588, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8714597225189209, + "num_tokens": 362635708.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "grad_norm": 1.4516072273254395, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.871141791343689, + "num_tokens": 362676451.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "grad_norm": 1.66510009765625, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8725312948226929, + "num_tokens": 362710641.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "grad_norm": 1.5555037260055542, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8844080567359924, + "num_tokens": 362746545.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "grad_norm": 1.4236241579055786, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 
0.8744410276412964, + "num_tokens": 362789268.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "grad_norm": 1.2858165502548218, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8868106603622437, + "num_tokens": 362833962.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "grad_norm": 1.4263432025909424, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.876257061958313, + "num_tokens": 362875102.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "grad_norm": 1.5267856121063232, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8771643042564392, + "num_tokens": 362911721.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "grad_norm": 1.5023188591003418, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8731402158737183, + "num_tokens": 362949756.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "grad_norm": 1.6428338289260864, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8661113977432251, + "num_tokens": 362983179.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "grad_norm": 1.4798861742019653, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.867210865020752, + "num_tokens": 363023154.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "grad_norm": 1.4622615575790405, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8730921745300293, + "num_tokens": 363063351.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "grad_norm": 1.485048532485962, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8732302188873291, + "num_tokens": 363101393.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "grad_norm": 1.6025323867797852, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8546838760375977, + "num_tokens": 363141365.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "grad_norm": 
1.4927271604537964, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8906542658805847, + "num_tokens": 363177422.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "grad_norm": 1.6315698623657227, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8669699430465698, + "num_tokens": 363210969.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "grad_norm": 1.5075920820236206, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8735647201538086, + "num_tokens": 363248210.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "grad_norm": 1.5740934610366821, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8839128613471985, + "num_tokens": 363283992.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "grad_norm": 1.585830569267273, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8740339279174805, + "num_tokens": 363319121.0, + "step": 9522 + }, + { + "epoch": 1.2114234830174277, + "grad_norm": 1.4620083570480347, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8810338973999023, + "num_tokens": 363356880.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "grad_norm": 1.3550059795379639, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8969390392303467, + "num_tokens": 363395981.0, + "step": 9524 + }, + { + "epoch": 1.2116779035746088, + "grad_norm": 1.6186126470565796, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.866280198097229, + "num_tokens": 363432334.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "grad_norm": 1.5641260147094727, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8924601078033447, + "num_tokens": 363466970.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "grad_norm": 1.5678716897964478, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8789430856704712, + "num_tokens": 
363500405.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "grad_norm": 1.737143635749817, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8708614706993103, + "num_tokens": 363530264.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "grad_norm": 1.3775774240493774, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8755077123641968, + "num_tokens": 363578626.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "grad_norm": 1.511828064918518, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8678125739097595, + "num_tokens": 363617500.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "grad_norm": 1.5526485443115234, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8824237585067749, + "num_tokens": 363654293.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "grad_norm": 1.4540584087371826, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8775876760482788, + "num_tokens": 363690998.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "grad_norm": 1.5938235521316528, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.869539201259613, + "num_tokens": 363726832.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "grad_norm": 1.4997715950012207, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8804516196250916, + "num_tokens": 363765262.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "grad_norm": 1.5607649087905884, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8577151298522949, + "num_tokens": 363808095.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "grad_norm": 1.4313874244689941, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8775593042373657, + "num_tokens": 363848400.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "grad_norm": 1.598081350326538, + "learning_rate": 1e-06, + 
"loss": 0.3704, + "mean_token_accuracy": 0.8685382008552551, + "num_tokens": 363882478.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "grad_norm": 1.34001624584198, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8949848413467407, + "num_tokens": 363921364.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "grad_norm": 1.4565343856811523, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8731561303138733, + "num_tokens": 363963022.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "grad_norm": 1.5297855138778687, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8614037036895752, + "num_tokens": 364002631.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "grad_norm": 1.7240729331970215, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8758145570755005, + "num_tokens": 364034762.0, + "step": 9541 + }, + { + "epoch": 1.2138404783106476, + "grad_norm": 1.4204432964324951, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8811159133911133, + "num_tokens": 364077147.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "grad_norm": 1.577533483505249, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8598794937133789, + "num_tokens": 364117008.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + "grad_norm": 1.5198646783828735, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8635035753250122, + "num_tokens": 364156875.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "grad_norm": 1.424323558807373, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.885774552822113, + "num_tokens": 364196171.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "grad_norm": 1.422135591506958, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.886555016040802, + "num_tokens": 364239713.0, + "step": 9546 + }, + { + "epoch": 
1.2144765297036, + "grad_norm": 1.4556442499160767, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8800143003463745, + "num_tokens": 364280970.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "grad_norm": 1.5313574075698853, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8715642690658569, + "num_tokens": 364315601.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "grad_norm": 1.489646077156067, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.881174623966217, + "num_tokens": 364352700.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "grad_norm": 1.4045261144638062, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8727641105651855, + "num_tokens": 364397150.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "grad_norm": 1.5074979066848755, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8692504167556763, + "num_tokens": 364438910.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "grad_norm": 1.4437592029571533, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8674646019935608, + "num_tokens": 364479436.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "grad_norm": 1.394916296005249, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8669791221618652, + "num_tokens": 364525284.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "grad_norm": 1.4493621587753296, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.880538821220398, + "num_tokens": 364567259.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "grad_norm": 1.5098586082458496, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8777702450752258, + "num_tokens": 364603596.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "grad_norm": 1.4986541271209717, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 
0.8726744055747986, + "num_tokens": 364642434.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "grad_norm": 1.5469120740890503, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8665238618850708, + "num_tokens": 364683359.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "grad_norm": 1.5240368843078613, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8689274787902832, + "num_tokens": 364725747.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "grad_norm": 1.559861421585083, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8835425972938538, + "num_tokens": 364765122.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "grad_norm": 1.5037771463394165, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8609715104103088, + "num_tokens": 364809478.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "grad_norm": 1.4741261005401611, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.879700779914856, + "num_tokens": 364849242.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "grad_norm": 1.5081779956817627, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.867469847202301, + "num_tokens": 364889326.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "grad_norm": 1.5404149293899536, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8706145286560059, + "num_tokens": 364926199.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "grad_norm": 1.529914140701294, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8607589602470398, + "num_tokens": 364966880.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "grad_norm": 1.6296961307525635, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8642171025276184, + "num_tokens": 365002492.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "grad_norm": 
1.3799196481704712, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8753235340118408, + "num_tokens": 365047497.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "grad_norm": 1.6638797521591187, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8587879538536072, + "num_tokens": 365078942.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "grad_norm": 1.712178349494934, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8714730143547058, + "num_tokens": 365110405.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "grad_norm": 1.4516119956970215, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8855608701705933, + "num_tokens": 365145720.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "grad_norm": 1.4605575799942017, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8647699356079102, + "num_tokens": 365185533.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "grad_norm": 1.4041274785995483, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8637549877166748, + "num_tokens": 365232597.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "grad_norm": 1.6508229970932007, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8550688028335571, + "num_tokens": 365268640.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "grad_norm": 1.5264654159545898, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8833782076835632, + "num_tokens": 365303624.0, + "step": 9573 + }, + { + "epoch": 1.2179112072255438, + "grad_norm": 1.3335620164871216, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8805697560310364, + "num_tokens": 365349563.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "grad_norm": 1.5403283834457397, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.866615891456604, + "num_tokens": 
365387945.0, + "step": 9575 + }, + { + "epoch": 1.2181656277827249, + "grad_norm": 1.46393620967865, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8932444453239441, + "num_tokens": 365422771.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "grad_norm": 1.7745945453643799, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.873215913772583, + "num_tokens": 365450745.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "grad_norm": 1.5234423875808716, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8761706948280334, + "num_tokens": 365490271.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "grad_norm": 1.4644323587417603, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.871356189250946, + "num_tokens": 365533642.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "grad_norm": 1.426161527633667, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8816558718681335, + "num_tokens": 365572541.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "grad_norm": 1.478686809539795, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8771762251853943, + "num_tokens": 365612337.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "grad_norm": 1.5701589584350586, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8704814910888672, + "num_tokens": 365649817.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "grad_norm": 1.5466989278793335, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8726545572280884, + "num_tokens": 365685803.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "grad_norm": 1.6249394416809082, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.868049144744873, + "num_tokens": 365721802.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "grad_norm": 1.5084738731384277, + "learning_rate": 1e-06, + "loss": 
0.3248, + "mean_token_accuracy": 0.8841353058815002, + "num_tokens": 365755597.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "grad_norm": 1.4367684125900269, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8788630962371826, + "num_tokens": 365795925.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "grad_norm": 1.457252025604248, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8794445395469666, + "num_tokens": 365839896.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "grad_norm": 1.6107076406478882, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8794198632240295, + "num_tokens": 365874118.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "grad_norm": 1.5367457866668701, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.875041127204895, + "num_tokens": 365911949.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "grad_norm": 1.5067089796066284, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8742929100990295, + "num_tokens": 365953832.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "grad_norm": 1.5935248136520386, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8741154670715332, + "num_tokens": 365992212.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "grad_norm": 1.3760734796524048, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8717502355575562, + "num_tokens": 366037893.0, + "step": 9592 + }, + { + "epoch": 1.2203282025187634, + "grad_norm": 1.6014599800109863, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8881024718284607, + "num_tokens": 366076874.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "grad_norm": 1.55690336227417, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.883841335773468, + "num_tokens": 366112568.0, + "step": 9594 + }, + { + "epoch": 1.2205826230759445, + 
"grad_norm": 1.4132812023162842, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8829152584075928, + "num_tokens": 366150134.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "grad_norm": 1.6029647588729858, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8574744462966919, + "num_tokens": 366190406.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "grad_norm": 1.667291522026062, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8620664477348328, + "num_tokens": 366231723.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "grad_norm": 1.5652254819869995, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8648558259010315, + "num_tokens": 366271846.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "grad_norm": 2.217113733291626, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.870022177696228, + "num_tokens": 366300909.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "grad_norm": 1.4685062170028687, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.877233624458313, + "num_tokens": 366341504.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "grad_norm": 1.4942359924316406, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8679677844047546, + "num_tokens": 366382231.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "grad_norm": 1.3574013710021973, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.890944242477417, + "num_tokens": 366427441.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "grad_norm": 1.5050665140151978, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8763401508331299, + "num_tokens": 366466427.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "grad_norm": 1.4268752336502075, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8822004199028015, + 
"num_tokens": 366508779.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "grad_norm": 1.5954898595809937, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8762959241867065, + "num_tokens": 366541733.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "grad_norm": 1.5431149005889893, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8718061447143555, + "num_tokens": 366580544.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "grad_norm": 1.5314512252807617, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8941227197647095, + "num_tokens": 366614742.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "grad_norm": 1.5079940557479858, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.888197660446167, + "num_tokens": 366652268.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "grad_norm": 1.5715631246566772, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.874407172203064, + "num_tokens": 366690733.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "grad_norm": 1.4501396417617798, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8898962140083313, + "num_tokens": 366730311.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "grad_norm": 1.6385226249694824, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8782159090042114, + "num_tokens": 366765237.0, + "step": 9611 + }, + { + "epoch": 1.2227451978119832, + "grad_norm": 1.6112202405929565, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8654953241348267, + "num_tokens": 366802984.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "grad_norm": 1.5210477113723755, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.873741626739502, + "num_tokens": 366840550.0, + "step": 9613 + }, + { + "epoch": 1.2229996183691643, + "grad_norm": 1.6058290004730225, + 
"learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8732194900512695, + "num_tokens": 366874921.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "grad_norm": 1.4731162786483765, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8836636543273926, + "num_tokens": 366914007.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "grad_norm": 1.4506129026412964, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.884733259677887, + "num_tokens": 366956760.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "grad_norm": 1.395293951034546, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8895325660705566, + "num_tokens": 366999554.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "grad_norm": 1.4868091344833374, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8758192658424377, + "num_tokens": 367039368.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "grad_norm": 1.446750521659851, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8778456449508667, + "num_tokens": 367081812.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "grad_norm": 1.3936660289764404, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8853923678398132, + "num_tokens": 367124063.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "grad_norm": 1.5133227109909058, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.867681086063385, + "num_tokens": 367160800.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "grad_norm": 1.653847098350525, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.859530508518219, + "num_tokens": 367197669.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "grad_norm": 1.6112101078033447, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8811250925064087, + "num_tokens": 367233465.0, + "step": 9623 + 
}, + { + "epoch": 1.2242717211550693, + "grad_norm": 1.6831285953521729, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8696190714836121, + "num_tokens": 367264694.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "grad_norm": 1.676857590675354, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8729456663131714, + "num_tokens": 367298912.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "grad_norm": 1.4826347827911377, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8787692785263062, + "num_tokens": 367339000.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "grad_norm": 1.526329517364502, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8699136972427368, + "num_tokens": 367380008.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "grad_norm": 1.5496751070022583, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8729833960533142, + "num_tokens": 367416144.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "grad_norm": 1.5173834562301636, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8697143793106079, + "num_tokens": 367456595.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "grad_norm": 1.5530695915222168, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8792259693145752, + "num_tokens": 367489836.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + "grad_norm": 1.5774787664413452, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8673666715621948, + "num_tokens": 367528834.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "grad_norm": 1.5106265544891357, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8749389052391052, + "num_tokens": 367564728.0, + "step": 9632 + }, + { + "epoch": 1.2254166136623839, + "grad_norm": 1.6101425886154175, + "learning_rate": 1e-06, + "loss": 0.3695, + 
"mean_token_accuracy": 0.8660698533058167, + "num_tokens": 367599314.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "grad_norm": 1.4893332719802856, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8692125082015991, + "num_tokens": 367641275.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "grad_norm": 1.4704482555389404, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8817118406295776, + "num_tokens": 367683391.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "grad_norm": 1.5419772863388062, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8738263845443726, + "num_tokens": 367719668.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "grad_norm": 1.4727327823638916, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8798115849494934, + "num_tokens": 367758796.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "grad_norm": 1.5555826425552368, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8725833892822266, + "num_tokens": 367795104.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "grad_norm": 1.5163863897323608, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8660041093826294, + "num_tokens": 367836486.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "grad_norm": 1.6585216522216797, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8514806628227234, + "num_tokens": 367872361.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "grad_norm": 1.6591966152191162, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8755307793617249, + "num_tokens": 367904339.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "grad_norm": 1.6405564546585083, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8648029565811157, + "num_tokens": 367937438.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + 
"grad_norm": 1.4480475187301636, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.888247013092041, + "num_tokens": 367976262.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "grad_norm": 1.4970300197601318, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.867518961429596, + "num_tokens": 368019708.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "grad_norm": 1.576509952545166, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8696250915527344, + "num_tokens": 368058801.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "grad_norm": 1.422898292541504, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8837619423866272, + "num_tokens": 368098450.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "grad_norm": 1.5448731184005737, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.870690643787384, + "num_tokens": 368137734.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "grad_norm": 1.6108801364898682, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8696570992469788, + "num_tokens": 368173281.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "grad_norm": 1.4961899518966675, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8810214400291443, + "num_tokens": 368213615.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "grad_norm": 1.46824049949646, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8882108926773071, + "num_tokens": 368249688.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "grad_norm": 1.4413065910339355, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8782050609588623, + "num_tokens": 368292914.0, + "step": 9651 + }, + { + "epoch": 1.2278336089556037, + "grad_norm": 1.6478575468063354, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8546944260597229, + "num_tokens": 
368333397.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "grad_norm": 1.6168203353881836, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8782143592834473, + "num_tokens": 368367613.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "grad_norm": 1.5671908855438232, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8680304288864136, + "num_tokens": 368407854.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "grad_norm": 1.6459121704101562, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8820341229438782, + "num_tokens": 368442156.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "grad_norm": 1.4816116094589233, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8771200180053711, + "num_tokens": 368481617.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "grad_norm": 1.405767798423767, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8872220516204834, + "num_tokens": 368523603.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "grad_norm": 1.5389928817749023, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8791922926902771, + "num_tokens": 368564775.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "grad_norm": 1.6347583532333374, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8524538278579712, + "num_tokens": 368606752.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "grad_norm": 1.5559552907943726, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8799419403076172, + "num_tokens": 368641801.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "grad_norm": 1.5184835195541382, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8785324692726135, + "num_tokens": 368676039.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "grad_norm": 1.431201457977295, + "learning_rate": 1e-06, + 
"loss": 0.3509, + "mean_token_accuracy": 0.8779305219650269, + "num_tokens": 368715050.0, + "step": 9662 + }, + { + "epoch": 1.2292329220200993, + "grad_norm": 1.4700318574905396, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8942843675613403, + "num_tokens": 368750827.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "grad_norm": 1.623523235321045, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8720150589942932, + "num_tokens": 368789359.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "grad_norm": 1.6542935371398926, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8780912160873413, + "num_tokens": 368820194.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "grad_norm": 1.6205333471298218, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8699374794960022, + "num_tokens": 368857670.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "grad_norm": 1.4879693984985352, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8852411508560181, + "num_tokens": 368892848.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "grad_norm": 1.5655303001403809, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.876549243927002, + "num_tokens": 368928692.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "grad_norm": 1.610626459121704, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8597010374069214, + "num_tokens": 368967908.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "grad_norm": 1.6277704238891602, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8674471378326416, + "num_tokens": 369006809.0, + "step": 9670 + }, + { + "epoch": 1.2302506042488233, + "grad_norm": 1.5281260013580322, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8819242715835571, + "num_tokens": 369044340.0, + "step": 9671 + }, + { + "epoch": 
1.2303778145274138, + "grad_norm": 1.631762981414795, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8831048011779785, + "num_tokens": 369074655.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "grad_norm": 1.6044015884399414, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8781967163085938, + "num_tokens": 369111485.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "grad_norm": 1.4822252988815308, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8884322047233582, + "num_tokens": 369152428.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "grad_norm": 1.561407208442688, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8854992985725403, + "num_tokens": 369186050.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "grad_norm": 1.4539902210235596, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8849822878837585, + "num_tokens": 369224017.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "grad_norm": 1.5858275890350342, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8658690452575684, + "num_tokens": 369260588.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "grad_norm": 1.5967909097671509, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8865610361099243, + "num_tokens": 369295396.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "grad_norm": 1.40980863571167, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8746610879898071, + "num_tokens": 369340915.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "grad_norm": 1.5433467626571655, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8759310245513916, + "num_tokens": 369379629.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "grad_norm": 1.5330173969268799, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 
0.8805958032608032, + "num_tokens": 369417844.0, + "step": 9681 + }, + { + "epoch": 1.2316499173133189, + "grad_norm": 1.4942420721054077, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8727979063987732, + "num_tokens": 369457939.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "grad_norm": 1.5745052099227905, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8581022024154663, + "num_tokens": 369498840.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "grad_norm": 1.450818419456482, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8653664588928223, + "num_tokens": 369539488.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "grad_norm": 1.6563845872879028, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8688414692878723, + "num_tokens": 369573503.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "grad_norm": 1.3949713706970215, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8902676701545715, + "num_tokens": 369612423.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "grad_norm": 1.5541247129440308, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.880131721496582, + "num_tokens": 369649735.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "grad_norm": 1.5811364650726318, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8658108115196228, + "num_tokens": 369688030.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "grad_norm": 1.575270414352417, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8679429888725281, + "num_tokens": 369725540.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, + "grad_norm": 1.491101622581482, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8726629018783569, + "num_tokens": 369768991.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "grad_norm": 1.5502344369888306, 
+ "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8681942224502563, + "num_tokens": 369807195.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "grad_norm": 1.6701642274856567, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8738003373146057, + "num_tokens": 369837818.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "grad_norm": 1.4679460525512695, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8585083484649658, + "num_tokens": 369882439.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "grad_norm": 1.4433207511901855, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8838291764259338, + "num_tokens": 369923222.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "grad_norm": 1.4785914421081543, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8763151168823242, + "num_tokens": 369961761.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "grad_norm": 1.483607292175293, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8911839723587036, + "num_tokens": 369994988.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "grad_norm": 1.5257108211517334, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8690721392631531, + "num_tokens": 370032231.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "grad_norm": 1.5880581140518188, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8667434453964233, + "num_tokens": 370074704.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "grad_norm": 1.6879627704620361, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8797715902328491, + "num_tokens": 370109256.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "grad_norm": 1.3774876594543457, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.87673020362854, + "num_tokens": 370150512.0, + "step": 9700 + 
}, + { + "epoch": 1.2340669126065387, + "grad_norm": 1.5795289278030396, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8745227456092834, + "num_tokens": 370185477.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "grad_norm": 1.7406800985336304, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8739138245582581, + "num_tokens": 370214578.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "grad_norm": 1.51024329662323, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8645474910736084, + "num_tokens": 370257828.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "grad_norm": 1.4963693618774414, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8802255392074585, + "num_tokens": 370296433.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "grad_norm": 1.5085140466690063, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8788784742355347, + "num_tokens": 370331773.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "grad_norm": 1.4412535429000854, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8711147308349609, + "num_tokens": 370373303.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "grad_norm": 1.4412205219268799, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8682950735092163, + "num_tokens": 370415469.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "grad_norm": 1.457339882850647, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8784525394439697, + "num_tokens": 370452919.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "grad_norm": 1.414149522781372, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8781627416610718, + "num_tokens": 370496675.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "grad_norm": 1.6280498504638672, + "learning_rate": 1e-06, + "loss": 0.3907, + 
"mean_token_accuracy": 0.866301417350769, + "num_tokens": 370534461.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "grad_norm": 1.4449265003204346, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.891619086265564, + "num_tokens": 370571620.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "grad_norm": 1.6292269229888916, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8638168573379517, + "num_tokens": 370606901.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "grad_norm": 1.4893271923065186, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.889019250869751, + "num_tokens": 370643464.0, + "step": 9713 + }, + { + "epoch": 1.2357206462282153, + "grad_norm": 1.5296053886413574, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8724966049194336, + "num_tokens": 370683861.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "grad_norm": 1.6454846858978271, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8750213384628296, + "num_tokens": 370717827.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "grad_norm": 1.48931884765625, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8744069933891296, + "num_tokens": 370759165.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "grad_norm": 1.458945870399475, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.879148006439209, + "num_tokens": 370797769.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "grad_norm": 1.4553179740905762, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8669388294219971, + "num_tokens": 370838111.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "grad_norm": 1.537113904953003, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8838480710983276, + "num_tokens": 370874996.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + 
"grad_norm": 1.4424092769622803, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8796812295913696, + "num_tokens": 370913721.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "grad_norm": 1.523308277130127, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8720378875732422, + "num_tokens": 370953754.0, + "step": 9721 + }, + { + "epoch": 1.2367383284569393, + "grad_norm": 1.4696015119552612, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8685779571533203, + "num_tokens": 370994907.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "grad_norm": 1.455596685409546, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8779027462005615, + "num_tokens": 371040989.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "grad_norm": 1.4713246822357178, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8783367872238159, + "num_tokens": 371081041.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "grad_norm": 1.3746877908706665, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8795446753501892, + "num_tokens": 371122964.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "grad_norm": 1.4633785486221313, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8803092241287231, + "num_tokens": 371158782.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "grad_norm": 1.4987103939056396, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8642466068267822, + "num_tokens": 371198500.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "grad_norm": 1.5956906080245972, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8681507110595703, + "num_tokens": 371236949.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "grad_norm": 1.5666126012802124, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8904454112052917, + 
"num_tokens": 371275416.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "grad_norm": 1.6281547546386719, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8865963220596313, + "num_tokens": 371309201.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "grad_norm": 1.5571280717849731, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8707060813903809, + "num_tokens": 371346002.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "grad_norm": 1.454732060432434, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8594928979873657, + "num_tokens": 371388868.0, + "step": 9732 + }, + { + "epoch": 1.238137641521435, + "grad_norm": 1.5814763307571411, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8654545545578003, + "num_tokens": 371425980.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "grad_norm": 1.5006630420684814, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8758429288864136, + "num_tokens": 371463404.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "grad_norm": 1.4485232830047607, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8802816867828369, + "num_tokens": 371501612.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "grad_norm": 1.6161932945251465, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8535600304603577, + "num_tokens": 371541768.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "grad_norm": 1.4457292556762695, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8824343085289001, + "num_tokens": 371580810.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "grad_norm": 1.453696608543396, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8743619322776794, + "num_tokens": 371620466.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "grad_norm": 1.6121723651885986, + 
"learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8707088232040405, + "num_tokens": 371656117.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "grad_norm": 1.4857802391052246, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8866609334945679, + "num_tokens": 371691829.0, + "step": 9740 + }, + { + "epoch": 1.239155323750159, + "grad_norm": 1.3959004878997803, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8843494653701782, + "num_tokens": 371733568.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "grad_norm": 1.3871080875396729, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.888953447341919, + "num_tokens": 371775331.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "grad_norm": 1.7371901273727417, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8659020662307739, + "num_tokens": 371815945.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "grad_norm": 1.4606610536575317, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8755055069923401, + "num_tokens": 371855197.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "grad_norm": 1.5588339567184448, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8732923269271851, + "num_tokens": 371893882.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "grad_norm": 1.588287591934204, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8531787395477295, + "num_tokens": 371932694.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "grad_norm": 1.8766487836837769, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8740654587745667, + "num_tokens": 371963724.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "grad_norm": 1.675235629081726, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8666914105415344, + "num_tokens": 372002220.0, + "step": 9748 + }, 
+ { + "epoch": 1.2401730059788831, + "grad_norm": 1.4204466342926025, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8840236663818359, + "num_tokens": 372044679.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "grad_norm": 1.5686720609664917, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8833060264587402, + "num_tokens": 372081186.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "grad_norm": 1.4554684162139893, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8969652056694031, + "num_tokens": 372119700.0, + "step": 9751 + }, + { + "epoch": 1.2405546368146547, + "grad_norm": 1.4947234392166138, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8767974376678467, + "num_tokens": 372158443.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "grad_norm": 1.5658060312271118, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8772538900375366, + "num_tokens": 372193855.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "grad_norm": 1.4679580926895142, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8830124735832214, + "num_tokens": 372232991.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "grad_norm": 1.558342456817627, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8737558126449585, + "num_tokens": 372267919.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "grad_norm": 1.5079368352890015, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8764265775680542, + "num_tokens": 372307381.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "grad_norm": 1.6730194091796875, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8466750979423523, + "num_tokens": 372346074.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "grad_norm": 1.6913025379180908, + "learning_rate": 1e-06, + "loss": 0.4028, + 
"mean_token_accuracy": 0.862672746181488, + "num_tokens": 372377841.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "grad_norm": 1.4962551593780518, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8817071318626404, + "num_tokens": 372414433.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "grad_norm": 1.4234107732772827, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8864583969116211, + "num_tokens": 372456510.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "grad_norm": 1.558876872062683, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8764100074768066, + "num_tokens": 372496311.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "grad_norm": 1.3967938423156738, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8626744747161865, + "num_tokens": 372541064.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "grad_norm": 1.5218256711959839, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8685238361358643, + "num_tokens": 372582711.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "grad_norm": 1.4617340564727783, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8814390301704407, + "num_tokens": 372618775.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "grad_norm": 1.471879005432129, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8743196725845337, + "num_tokens": 372657955.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "grad_norm": 1.3967540264129639, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8740487694740295, + "num_tokens": 372700048.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "grad_norm": 1.436090111732483, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8729332089424133, + "num_tokens": 372742341.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + 
"grad_norm": 1.5573465824127197, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8672568798065186, + "num_tokens": 372776760.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "grad_norm": 1.3945814371109009, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8802098035812378, + "num_tokens": 372817108.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "grad_norm": 1.477276086807251, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.865777850151062, + "num_tokens": 372856976.0, + "step": 9770 + }, + { + "epoch": 1.2429716321078743, + "grad_norm": 1.5664360523223877, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8440878987312317, + "num_tokens": 372896376.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "grad_norm": 1.537300944328308, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8725283145904541, + "num_tokens": 372932114.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "grad_norm": 1.4000961780548096, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8868036270141602, + "num_tokens": 372972557.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "grad_norm": 1.545478343963623, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8759939670562744, + "num_tokens": 373014111.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "grad_norm": 1.4950555562973022, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8736721277236938, + "num_tokens": 373049739.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "grad_norm": 1.4571038484573364, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8567402958869934, + "num_tokens": 373092894.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "grad_norm": 1.430544376373291, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8853198289871216, + 
"num_tokens": 373131767.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "grad_norm": 1.5850346088409424, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8583742380142212, + "num_tokens": 373171310.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "grad_norm": 1.3866156339645386, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8871159553527832, + "num_tokens": 373211454.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "grad_norm": 1.346570372581482, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8791340589523315, + "num_tokens": 373255535.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "grad_norm": 1.3993152379989624, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8825695514678955, + "num_tokens": 373297993.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "grad_norm": 1.426040530204773, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8832119703292847, + "num_tokens": 373338042.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "grad_norm": 1.5415579080581665, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8624116778373718, + "num_tokens": 373377042.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "grad_norm": 1.5447642803192139, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8798099756240845, + "num_tokens": 373415377.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "grad_norm": 1.5727064609527588, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8727405071258545, + "num_tokens": 373454320.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "grad_norm": 1.4399654865264893, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8729455471038818, + "num_tokens": 373495915.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "grad_norm": 1.36431884765625, + "learning_rate": 
1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8849135637283325, + "num_tokens": 373537044.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "grad_norm": 1.4584453105926514, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8910005688667297, + "num_tokens": 373573404.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "grad_norm": 1.584215521812439, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8703967332839966, + "num_tokens": 373608541.0, + "step": 9789 + }, + { + "epoch": 1.245388627401094, + "grad_norm": 1.4507479667663574, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8629399538040161, + "num_tokens": 373654282.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "grad_norm": 1.5271064043045044, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8898497819900513, + "num_tokens": 373687736.0, + "step": 9791 + }, + { + "epoch": 1.245643047958275, + "grad_norm": 1.544347882270813, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8720476627349854, + "num_tokens": 373726207.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "grad_norm": 1.4418613910675049, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8722074031829834, + "num_tokens": 373767443.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "grad_norm": 1.3534537553787231, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8864879608154297, + "num_tokens": 373809340.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "grad_norm": 1.526304006576538, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8826709985733032, + "num_tokens": 373845902.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "grad_norm": 1.5301687717437744, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.865852952003479, + "num_tokens": 373883973.0, + "step": 9796 + }, + { + "epoch": 
1.2462790993512276, + "grad_norm": 1.5779589414596558, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8733436465263367, + "num_tokens": 373918732.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "grad_norm": 1.5262279510498047, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8707844614982605, + "num_tokens": 373959772.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "grad_norm": 1.5785421133041382, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8634604811668396, + "num_tokens": 374000151.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "grad_norm": 1.4620577096939087, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8785961866378784, + "num_tokens": 374039415.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "grad_norm": 1.4736899137496948, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.877009391784668, + "num_tokens": 374083568.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "grad_norm": 1.5883293151855469, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.877332866191864, + "num_tokens": 374118762.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "grad_norm": 1.4835422039031982, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8695094585418701, + "num_tokens": 374160262.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "grad_norm": 1.5143303871154785, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8699629306793213, + "num_tokens": 374200619.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "grad_norm": 1.430777668952942, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8833006620407104, + "num_tokens": 374238303.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "grad_norm": 1.4221580028533936, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 
0.8873348832130432, + "num_tokens": 374278849.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "grad_norm": 1.6105133295059204, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.869411826133728, + "num_tokens": 374320983.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "grad_norm": 1.4812002182006836, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8908604383468628, + "num_tokens": 374354520.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "grad_norm": 1.656929850578308, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8667598962783813, + "num_tokens": 374388411.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "grad_norm": 1.5756173133850098, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.866439700126648, + "num_tokens": 374425076.0, + "step": 9810 + }, + { + "epoch": 1.2480600432514948, + "grad_norm": 1.4547501802444458, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8820434808731079, + "num_tokens": 374466219.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "grad_norm": 1.7895092964172363, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8788865804672241, + "num_tokens": 374497652.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "grad_norm": 1.4678606986999512, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8724057674407959, + "num_tokens": 374535379.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "grad_norm": 1.394546627998352, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8753364086151123, + "num_tokens": 374579779.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "grad_norm": 1.4432538747787476, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.877907931804657, + "num_tokens": 374620582.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "grad_norm": 
1.4991915225982666, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.870439887046814, + "num_tokens": 374659470.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "grad_norm": 1.4350351095199585, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8714229464530945, + "num_tokens": 374704394.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "grad_norm": 1.4298208951950073, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8807979822158813, + "num_tokens": 374745338.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "grad_norm": 1.6518176794052124, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8704103231430054, + "num_tokens": 374781079.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "grad_norm": 1.5104570388793945, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8730655908584595, + "num_tokens": 374820301.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "grad_norm": 1.4979852437973022, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8745800256729126, + "num_tokens": 374858194.0, + "step": 9821 + }, + { + "epoch": 1.2494593563159904, + "grad_norm": 1.4715067148208618, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8747946619987488, + "num_tokens": 374896818.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "grad_norm": 1.4807944297790527, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8700335025787354, + "num_tokens": 374937202.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "grad_norm": 1.522231101989746, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.880821704864502, + "num_tokens": 374971036.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "grad_norm": 1.4427947998046875, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8661296367645264, + "num_tokens": 
375015280.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "grad_norm": 1.4104654788970947, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8810379505157471, + "num_tokens": 375056517.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "grad_norm": 1.5479775667190552, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8667185306549072, + "num_tokens": 375096874.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "grad_norm": 1.5017890930175781, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8763794302940369, + "num_tokens": 375134957.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "grad_norm": 1.479416012763977, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8719146251678467, + "num_tokens": 375173149.0, + "step": 9829 + }, + { + "epoch": 1.2504770385447144, + "grad_norm": 1.6526306867599487, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8781691193580627, + "num_tokens": 375205403.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "grad_norm": 1.5341969728469849, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8704750537872314, + "num_tokens": 375244082.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "grad_norm": 1.4568380117416382, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8792091608047485, + "num_tokens": 375285459.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "grad_norm": 1.5508283376693726, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8730295896530151, + "num_tokens": 375319766.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "grad_norm": 1.5275579690933228, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8718044757843018, + "num_tokens": 375356945.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "grad_norm": 1.8547172546386719, + "learning_rate": 1e-06, + 
"loss": 0.3695, + "mean_token_accuracy": 0.8673529624938965, + "num_tokens": 375387296.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "grad_norm": 1.451859474182129, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8711003661155701, + "num_tokens": 375427961.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "grad_norm": 1.4609181880950928, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8776254653930664, + "num_tokens": 375469389.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "grad_norm": 1.5236730575561523, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.873319149017334, + "num_tokens": 375508004.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "grad_norm": 1.4875565767288208, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8768049478530884, + "num_tokens": 375547965.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "grad_norm": 1.4125699996948242, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8808096647262573, + "num_tokens": 375588445.0, + "step": 9840 + }, + { + "epoch": 1.25187635160921, + "grad_norm": 1.4510496854782104, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8713249564170837, + "num_tokens": 375628894.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "grad_norm": 1.5181725025177002, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8653683662414551, + "num_tokens": 375667143.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "grad_norm": 1.4851981401443481, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8787345886230469, + "num_tokens": 375706633.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "grad_norm": 1.6382486820220947, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8835679888725281, + "num_tokens": 375741242.0, + "step": 9844 + }, + { + "epoch": 
1.252385192723572, + "grad_norm": 1.5926915407180786, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8826691508293152, + "num_tokens": 375777435.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "grad_norm": 1.5125072002410889, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8703193664550781, + "num_tokens": 375818901.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "grad_norm": 1.3266034126281738, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8827780485153198, + "num_tokens": 375866427.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "grad_norm": 1.5762914419174194, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8764435648918152, + "num_tokens": 375904723.0, + "step": 9848 + }, + { + "epoch": 1.2528940338379342, + "grad_norm": 1.5208066701889038, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.879062294960022, + "num_tokens": 375942604.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "grad_norm": 1.6217355728149414, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8761030435562134, + "num_tokens": 375976569.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "grad_norm": 1.5109848976135254, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8726272583007812, + "num_tokens": 376016746.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "grad_norm": 1.451622724533081, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8720895051956177, + "num_tokens": 376056749.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "grad_norm": 1.4692738056182861, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8770409226417542, + "num_tokens": 376096787.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "grad_norm": 1.4322478771209717, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 
0.8810653686523438, + "num_tokens": 376136055.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "grad_norm": 1.586060881614685, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8714101314544678, + "num_tokens": 376174621.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "grad_norm": 1.4896305799484253, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8748044371604919, + "num_tokens": 376214407.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "grad_norm": 1.4882224798202515, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8886678218841553, + "num_tokens": 376251240.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "grad_norm": 1.4781941175460815, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8779300451278687, + "num_tokens": 376290959.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "grad_norm": 1.5258303880691528, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8821245431900024, + "num_tokens": 376328282.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "grad_norm": 1.684970498085022, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8760812878608704, + "num_tokens": 376366266.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "grad_norm": 1.6371163129806519, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8577734231948853, + "num_tokens": 376404194.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "grad_norm": 1.9899024963378906, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.886950671672821, + "num_tokens": 376438664.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "grad_norm": 1.5251003503799438, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8666768670082092, + "num_tokens": 376477565.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "grad_norm": 
1.4877609014511108, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8842176795005798, + "num_tokens": 376513634.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "grad_norm": 1.3737971782684326, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8741439580917358, + "num_tokens": 376556957.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "grad_norm": 1.7319778203964233, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8621336817741394, + "num_tokens": 376593986.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "grad_norm": 1.6229969263076782, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8694841265678406, + "num_tokens": 376630344.0, + "step": 9867 + }, + { + "epoch": 1.2553110291311538, + "grad_norm": 1.4618397951126099, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.881432294845581, + "num_tokens": 376671556.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "grad_norm": 1.4050244092941284, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8844696283340454, + "num_tokens": 376713127.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "grad_norm": 1.6046788692474365, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8676345944404602, + "num_tokens": 376751505.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "grad_norm": 1.4590829610824585, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8738210201263428, + "num_tokens": 376790822.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "grad_norm": 1.4537906646728516, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8774878978729248, + "num_tokens": 376834412.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "grad_norm": 1.4505528211593628, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.865299642086029, + "num_tokens": 
376877433.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "grad_norm": 1.6189380884170532, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8749375343322754, + "num_tokens": 376914797.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "grad_norm": 1.544562816619873, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8701266050338745, + "num_tokens": 376954575.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "grad_norm": 1.5904544591903687, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8873928189277649, + "num_tokens": 376990578.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "grad_norm": 1.2861766815185547, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8948967456817627, + "num_tokens": 377034697.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "grad_norm": 1.4089505672454834, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8845434188842773, + "num_tokens": 377075008.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "grad_norm": 1.6658339500427246, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8684157133102417, + "num_tokens": 377110032.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "grad_norm": 1.4951692819595337, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8778396844863892, + "num_tokens": 377147797.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "grad_norm": 1.5082279443740845, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8773724436759949, + "num_tokens": 377184920.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "grad_norm": 1.6147128343582153, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8805530071258545, + "num_tokens": 377217287.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "grad_norm": 1.5009461641311646, + "learning_rate": 1e-06, + 
"loss": 0.325, + "mean_token_accuracy": 0.8840714693069458, + "num_tokens": 377255120.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "grad_norm": 1.440744161605835, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8663166761398315, + "num_tokens": 377299682.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "grad_norm": 1.5233150720596313, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8663949966430664, + "num_tokens": 377342718.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "grad_norm": 1.3997912406921387, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.884955883026123, + "num_tokens": 377380031.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "grad_norm": 1.4995514154434204, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.885014533996582, + "num_tokens": 377416058.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "grad_norm": 1.5594356060028076, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.864799976348877, + "num_tokens": 377455817.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "grad_norm": 1.6739710569381714, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8631983995437622, + "num_tokens": 377492316.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "grad_norm": 1.5886071920394897, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8750330209732056, + "num_tokens": 377528719.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "grad_norm": 1.5434553623199463, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8570058345794678, + "num_tokens": 377571622.0, + "step": 9891 + }, + { + "epoch": 1.258364075817326, + "grad_norm": 1.6206684112548828, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8645687103271484, + "num_tokens": 377606473.0, + "step": 9892 + }, + { + "epoch": 
1.2584912860959165, + "grad_norm": 1.4559205770492554, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.87261962890625, + "num_tokens": 377649485.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "grad_norm": 1.4737504720687866, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8873515129089355, + "num_tokens": 377689039.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "grad_norm": 1.487451195716858, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8763865232467651, + "num_tokens": 377727846.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "grad_norm": 1.7327311038970947, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8697972297668457, + "num_tokens": 377759371.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "grad_norm": 1.4575748443603516, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8829978704452515, + "num_tokens": 377799952.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "grad_norm": 1.5237263441085815, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8791726231575012, + "num_tokens": 377837794.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "grad_norm": 1.5131586790084839, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8746880888938904, + "num_tokens": 377879482.0, + "step": 9899 + }, + { + "epoch": 1.2593817580460502, + "grad_norm": 1.609216570854187, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8753423094749451, + "num_tokens": 377917651.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "grad_norm": 1.513792634010315, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8844442367553711, + "num_tokens": 377953399.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "grad_norm": 1.6227492094039917, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 
0.8613047003746033, + "num_tokens": 377994125.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "grad_norm": 1.4602352380752563, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8827651143074036, + "num_tokens": 378034891.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "grad_norm": 1.5416858196258545, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8843080997467041, + "num_tokens": 378070107.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "grad_norm": 1.7081941366195679, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8732966780662537, + "num_tokens": 378102078.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "grad_norm": 1.5131028890609741, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8837864398956299, + "num_tokens": 378141807.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "grad_norm": 1.5428112745285034, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8755606412887573, + "num_tokens": 378181888.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "grad_norm": 1.5244444608688354, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8761920928955078, + "num_tokens": 378219498.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "grad_norm": 1.4838814735412598, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8873589038848877, + "num_tokens": 378256587.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "grad_norm": 1.5687172412872314, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8728095889091492, + "num_tokens": 378295241.0, + "step": 9910 + }, + { + "epoch": 1.2607810711105456, + "grad_norm": 1.4087294340133667, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8777635097503662, + "num_tokens": 378334939.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "grad_norm": 
1.568664312362671, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8839575052261353, + "num_tokens": 378368481.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "grad_norm": 1.5777944326400757, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8782394528388977, + "num_tokens": 378404544.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "grad_norm": 1.4417905807495117, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8857821226119995, + "num_tokens": 378440816.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "grad_norm": 1.517418622970581, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8874356150627136, + "num_tokens": 378478586.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "grad_norm": 1.6435471773147583, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.869349479675293, + "num_tokens": 378512947.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "grad_norm": 1.5393952131271362, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.883241593837738, + "num_tokens": 378548291.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "grad_norm": 1.5786685943603516, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8837493062019348, + "num_tokens": 378580549.0, + "step": 9918 + }, + { + "epoch": 1.2617987533392698, + "grad_norm": 1.4178037643432617, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8914907574653625, + "num_tokens": 378620531.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "grad_norm": 1.5050605535507202, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8784273266792297, + "num_tokens": 378657448.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "grad_norm": 1.4348831176757812, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8712373375892639, + "num_tokens": 
378698780.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "grad_norm": 1.636230707168579, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8894757032394409, + "num_tokens": 378728665.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "grad_norm": 1.5348354578018188, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.876625120639801, + "num_tokens": 378763385.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "grad_norm": 1.3074235916137695, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8928239345550537, + "num_tokens": 378807132.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "grad_norm": 1.4312480688095093, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8860864043235779, + "num_tokens": 378844605.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "grad_norm": 1.5442514419555664, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8850363492965698, + "num_tokens": 378880083.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "grad_norm": 1.4491816759109497, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.864814043045044, + "num_tokens": 378922832.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "grad_norm": 1.4621027708053589, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8756910562515259, + "num_tokens": 378963771.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "grad_norm": 1.4528008699417114, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8747481107711792, + "num_tokens": 379003967.0, + "step": 9929 + }, + { + "epoch": 1.2631980664037654, + "grad_norm": 1.574625849723816, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8912897706031799, + "num_tokens": 379036249.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "grad_norm": 1.5735350847244263, + "learning_rate": 1e-06, + 
"loss": 0.3573, + "mean_token_accuracy": 0.875511884689331, + "num_tokens": 379069850.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "grad_norm": 2.2206101417541504, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8800442814826965, + "num_tokens": 379104578.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "grad_norm": 1.4605907201766968, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8875345587730408, + "num_tokens": 379142059.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "grad_norm": 1.5057024955749512, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8646167516708374, + "num_tokens": 379187393.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "grad_norm": 1.50607168674469, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8759032487869263, + "num_tokens": 379222988.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "grad_norm": 1.6533616781234741, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8774304986000061, + "num_tokens": 379255358.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "grad_norm": 1.436353325843811, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8625010251998901, + "num_tokens": 379298065.0, + "step": 9937 + }, + { + "epoch": 1.2642157486324894, + "grad_norm": 1.4724946022033691, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8725599050521851, + "num_tokens": 379336223.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "grad_norm": 1.5680636167526245, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8607856035232544, + "num_tokens": 379377076.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "grad_norm": 1.502000093460083, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8630355596542358, + "num_tokens": 379421573.0, + "step": 9940 + }, + { + "epoch": 
1.264597379468261, + "grad_norm": 1.5098450183868408, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8875505924224854, + "num_tokens": 379458515.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "grad_norm": 1.5618828535079956, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.869147539138794, + "num_tokens": 379496509.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "grad_norm": 1.4825812578201294, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8737256526947021, + "num_tokens": 379538996.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "grad_norm": 1.5257704257965088, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8639700412750244, + "num_tokens": 379580595.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "grad_norm": 1.338701844215393, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8877406716346741, + "num_tokens": 379622982.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "grad_norm": 1.4859960079193115, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8745348453521729, + "num_tokens": 379658796.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "grad_norm": 1.5385785102844238, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8785693049430847, + "num_tokens": 379695509.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "grad_norm": 1.4741151332855225, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8733913898468018, + "num_tokens": 379732692.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + "grad_norm": 1.5101289749145508, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8796016573905945, + "num_tokens": 379768716.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "grad_norm": 1.551232099533081, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 
0.8893947005271912, + "num_tokens": 379804857.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "grad_norm": 1.4566549062728882, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8788248300552368, + "num_tokens": 379842882.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "grad_norm": 1.459192156791687, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8752956390380859, + "num_tokens": 379887099.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "grad_norm": 1.4357887506484985, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8868805766105652, + "num_tokens": 379927598.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "grad_norm": 1.464952826499939, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8778121471405029, + "num_tokens": 379964548.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "grad_norm": 1.606313943862915, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8811874389648438, + "num_tokens": 380003924.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "grad_norm": 1.5557411909103394, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8851096034049988, + "num_tokens": 380039966.0, + "step": 9956 + }, + { + "epoch": 1.2666327439257092, + "grad_norm": 1.5831272602081299, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8738771080970764, + "num_tokens": 380073340.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "grad_norm": 1.4589225053787231, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8700987100601196, + "num_tokens": 380113687.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "grad_norm": 1.5255039930343628, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.870093047618866, + "num_tokens": 380154795.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "grad_norm": 
1.5131433010101318, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8812907338142395, + "num_tokens": 380192667.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "grad_norm": 1.5328142642974854, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8854489326477051, + "num_tokens": 380226023.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "grad_norm": 1.5175904035568237, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8728869557380676, + "num_tokens": 380262321.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "grad_norm": 1.3248283863067627, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8937629461288452, + "num_tokens": 380303331.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "grad_norm": 1.4259648323059082, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8914852142333984, + "num_tokens": 380341654.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "grad_norm": 1.4857515096664429, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8872864842414856, + "num_tokens": 380377187.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "grad_norm": 1.6295738220214844, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8700284361839294, + "num_tokens": 380411609.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "grad_norm": 1.4915707111358643, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8786308765411377, + "num_tokens": 380451372.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "grad_norm": 1.5600504875183105, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8699585199356079, + "num_tokens": 380489626.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "grad_norm": 1.5181406736373901, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8885263800621033, + "num_tokens": 
380528945.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "grad_norm": 1.3745229244232178, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8726654052734375, + "num_tokens": 380573751.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "grad_norm": 1.5365041494369507, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8558158278465271, + "num_tokens": 380619235.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "grad_norm": 1.427476406097412, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8908694982528687, + "num_tokens": 380661396.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "grad_norm": 1.4075121879577637, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8836009502410889, + "num_tokens": 380701224.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "grad_norm": 1.448290467262268, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8794571161270142, + "num_tokens": 380737947.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "grad_norm": 1.468124508857727, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8721306324005127, + "num_tokens": 380779805.0, + "step": 9975 + }, + { + "epoch": 1.2690497392189288, + "grad_norm": 1.537251353263855, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8832131028175354, + "num_tokens": 380816776.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "grad_norm": 1.5983296632766724, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8896780014038086, + "num_tokens": 380849680.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "grad_norm": 1.3766632080078125, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8780301809310913, + "num_tokens": 380893724.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "grad_norm": 1.4794985055923462, + "learning_rate": 1e-06, + 
"loss": 0.3509, + "mean_token_accuracy": 0.8766217231750488, + "num_tokens": 380933230.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "grad_norm": 1.4491013288497925, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8759942054748535, + "num_tokens": 380975088.0, + "step": 9980 + }, + { + "epoch": 1.2696857906118815, + "grad_norm": 1.5618953704833984, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.868291974067688, + "num_tokens": 381016161.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "grad_norm": 1.447788953781128, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8936188817024231, + "num_tokens": 381056090.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "grad_norm": 1.6547082662582397, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8776760697364807, + "num_tokens": 381090774.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "grad_norm": 1.5856618881225586, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8626665472984314, + "num_tokens": 381128218.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "grad_norm": 1.5310001373291016, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.860762894153595, + "num_tokens": 381167633.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "grad_norm": 1.6643935441970825, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8766907453536987, + "num_tokens": 381204597.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "grad_norm": 1.3693153858184814, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8905526399612427, + "num_tokens": 381242533.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "grad_norm": 1.4499382972717285, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8683045506477356, + "num_tokens": 381285148.0, + "step": 9988 + }, + { + "epoch": 
1.2707034728406055, + "grad_norm": 1.5779516696929932, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.885283350944519, + "num_tokens": 381320011.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "grad_norm": 1.5660779476165771, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8678827285766602, + "num_tokens": 381356235.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "grad_norm": 1.4216786623001099, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8873260021209717, + "num_tokens": 381396188.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "grad_norm": 1.6097521781921387, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8737412691116333, + "num_tokens": 381430994.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "grad_norm": 1.7238813638687134, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.875207781791687, + "num_tokens": 381462869.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "grad_norm": 1.701097846031189, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8757628202438354, + "num_tokens": 381494577.0, + "step": 9994 + }, + { + "epoch": 1.2714667345121486, + "grad_norm": 1.4289429187774658, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8789629936218262, + "num_tokens": 381535684.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "grad_norm": 1.6071869134902954, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.89217209815979, + "num_tokens": 381567225.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "grad_norm": 1.4953454732894897, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8633320331573486, + "num_tokens": 381610007.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "grad_norm": 1.6104909181594849, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 
0.8694400787353516, + "num_tokens": 381650788.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "grad_norm": 1.5064932107925415, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8707399964332581, + "num_tokens": 381692625.0, + "step": 9999 + }, + { + "epoch": 1.272102785905101, + "grad_norm": 1.4141108989715576, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.872745931148529, + "num_tokens": 381735848.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "grad_norm": 1.6157008409500122, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8783323168754578, + "num_tokens": 381769633.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "grad_norm": 1.5490552186965942, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8701457977294922, + "num_tokens": 381810253.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "grad_norm": 1.5478185415267944, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8746199011802673, + "num_tokens": 381848344.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "grad_norm": 1.4147731065750122, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8644826412200928, + "num_tokens": 381893383.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "grad_norm": 1.456256628036499, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8741273880004883, + "num_tokens": 381935634.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "grad_norm": 1.4891265630722046, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8745434880256653, + "num_tokens": 381972953.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "grad_norm": 1.6152615547180176, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8653980493545532, + "num_tokens": 382010283.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "grad_norm": 
1.5065871477127075, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8903765678405762, + "num_tokens": 382046263.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "grad_norm": 1.5209683179855347, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.872624933719635, + "num_tokens": 382087695.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "grad_norm": 1.793524980545044, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.854120671749115, + "num_tokens": 382123226.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "grad_norm": 1.5554863214492798, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8740413188934326, + "num_tokens": 382161772.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "grad_norm": 1.6348845958709717, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8699414730072021, + "num_tokens": 382197895.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "grad_norm": 1.5500293970108032, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8954846262931824, + "num_tokens": 382233327.0, + "step": 10013 + }, + { + "epoch": 1.2738837298053682, + "grad_norm": 1.4153831005096436, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8776575326919556, + "num_tokens": 382273215.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "grad_norm": 1.4732640981674194, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8841754198074341, + "num_tokens": 382311037.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "grad_norm": 1.6453860998153687, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8776904344558716, + "num_tokens": 382344864.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "grad_norm": 1.5765159130096436, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8793148994445801, + 
"num_tokens": 382383065.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "grad_norm": 1.5700147151947021, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8877480030059814, + "num_tokens": 382417347.0, + "step": 10018 + }, + { + "epoch": 1.2745197811983209, + "grad_norm": 1.546260952949524, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8750920295715332, + "num_tokens": 382454604.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "grad_norm": 1.5378509759902954, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8909353613853455, + "num_tokens": 382487967.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "grad_norm": 1.495406150817871, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8765955567359924, + "num_tokens": 382526389.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "grad_norm": 1.6234984397888184, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.888498067855835, + "num_tokens": 382561578.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "grad_norm": 1.4710341691970825, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8774635791778564, + "num_tokens": 382600159.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "grad_norm": 1.5978859663009644, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8760728240013123, + "num_tokens": 382631887.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "grad_norm": 1.4649405479431152, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8767760396003723, + "num_tokens": 382672341.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "grad_norm": 1.450075626373291, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8810560703277588, + "num_tokens": 382714535.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "grad_norm": 1.517348051071167, + 
"learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8644508123397827, + "num_tokens": 382757661.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "grad_norm": 1.5949363708496094, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8716400861740112, + "num_tokens": 382796759.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "grad_norm": 1.4953334331512451, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.876585841178894, + "num_tokens": 382835331.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "grad_norm": 1.537305474281311, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8692759275436401, + "num_tokens": 382872053.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "grad_norm": 1.4765533208847046, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8803081512451172, + "num_tokens": 382914215.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "grad_norm": 1.6190448999404907, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8780081272125244, + "num_tokens": 382946945.0, + "step": 10032 + }, + { + "epoch": 1.2763007250985878, + "grad_norm": 1.4522439241409302, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8824505805969238, + "num_tokens": 382987659.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "grad_norm": 1.3754106760025024, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8746451735496521, + "num_tokens": 383032043.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "grad_norm": 1.4569838047027588, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8830556273460388, + "num_tokens": 383069991.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "grad_norm": 1.5063718557357788, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8801857233047485, + "num_tokens": 383104362.0, + 
"step": 10036 + }, + { + "epoch": 1.27680956621295, + "grad_norm": 1.4745780229568481, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8905419111251831, + "num_tokens": 383142760.0, + "step": 10037 + }, + { + "epoch": 1.2769367764915405, + "grad_norm": 1.603569507598877, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.87624591588974, + "num_tokens": 383178255.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "grad_norm": 1.5038446187973022, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8759607076644897, + "num_tokens": 383218729.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "grad_norm": 1.5716310739517212, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8703494071960449, + "num_tokens": 383256652.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "grad_norm": 1.446740746498108, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.889119565486908, + "num_tokens": 383295307.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "grad_norm": 1.5751179456710815, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8666620254516602, + "num_tokens": 383332355.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "grad_norm": 1.4934805631637573, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8605579137802124, + "num_tokens": 383375193.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "grad_norm": 1.6589258909225464, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8512964248657227, + "num_tokens": 383414316.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "grad_norm": 1.5790646076202393, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8751453161239624, + "num_tokens": 383450726.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "grad_norm": 1.6705114841461182, + "learning_rate": 1e-06, + "loss": 
0.329, + "mean_token_accuracy": 0.8843069076538086, + "num_tokens": 383482663.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "grad_norm": 1.4564177989959717, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8751561045646667, + "num_tokens": 383522455.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "grad_norm": 1.516496181488037, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8666493892669678, + "num_tokens": 383561557.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "grad_norm": 1.3813248872756958, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8732265830039978, + "num_tokens": 383603409.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "grad_norm": 1.5404961109161377, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8765681385993958, + "num_tokens": 383642095.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "grad_norm": 1.441428303718567, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8765762448310852, + "num_tokens": 383680975.0, + "step": 10051 + }, + { + "epoch": 1.2787177203918076, + "grad_norm": 1.7075817584991455, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8604715466499329, + "num_tokens": 383714797.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "grad_norm": 1.4541442394256592, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8704020380973816, + "num_tokens": 383757208.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "grad_norm": 1.4892555475234985, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8724342584609985, + "num_tokens": 383797981.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "grad_norm": 1.4973384141921997, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8672291040420532, + "num_tokens": 383837006.0, + "step": 10055 + }, + { + "epoch": 
1.2792265615061698, + "grad_norm": 1.3942856788635254, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8868653178215027, + "num_tokens": 383875092.0, + "step": 10056 + }, + { + "epoch": 1.2793537717847603, + "grad_norm": 1.6259799003601074, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8679304718971252, + "num_tokens": 383908552.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "grad_norm": 1.6272435188293457, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8589135408401489, + "num_tokens": 383946021.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "grad_norm": 1.4957783222198486, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8696587681770325, + "num_tokens": 383985562.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "grad_norm": 1.4240996837615967, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.879781186580658, + "num_tokens": 384028800.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "grad_norm": 1.6790544986724854, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8703651428222656, + "num_tokens": 384059125.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "grad_norm": 1.5171194076538086, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8767980933189392, + "num_tokens": 384093378.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "grad_norm": 1.4256144762039185, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8666110038757324, + "num_tokens": 384139240.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "grad_norm": 1.7374331951141357, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.87095046043396, + "num_tokens": 384170706.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "grad_norm": 1.4963161945343018, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 
0.8790372014045715, + "num_tokens": 384208229.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "grad_norm": 1.5040984153747559, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8825722932815552, + "num_tokens": 384246008.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "grad_norm": 1.6483668088912964, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8790973424911499, + "num_tokens": 384277724.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "grad_norm": 1.4878075122833252, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8841428160667419, + "num_tokens": 384315791.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "grad_norm": 1.6030620336532593, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8507754802703857, + "num_tokens": 384354513.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "grad_norm": 1.4384995698928833, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8728537559509277, + "num_tokens": 384397794.0, + "step": 10070 + }, + { + "epoch": 1.2811347156850275, + "grad_norm": 1.729825735092163, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8620668649673462, + "num_tokens": 384431629.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "grad_norm": 1.4349656105041504, + "learning_rate": 1e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.9114527106285095, + "num_tokens": 384468379.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "grad_norm": 1.5578725337982178, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8746222853660583, + "num_tokens": 384506580.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "grad_norm": 1.5301443338394165, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8782482147216797, + "num_tokens": 384542136.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "grad_norm": 
1.5652967691421509, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.863469123840332, + "num_tokens": 384580843.0, + "step": 10075 + }, + { + "epoch": 1.2817707670779799, + "grad_norm": 1.6957104206085205, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8705127239227295, + "num_tokens": 384613807.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "grad_norm": 1.454301357269287, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.867352306842804, + "num_tokens": 384655136.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "grad_norm": 1.5537347793579102, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8725050687789917, + "num_tokens": 384692878.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "grad_norm": 1.537489652633667, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8589735627174377, + "num_tokens": 384732331.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "grad_norm": 1.597437858581543, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.861338198184967, + "num_tokens": 384769239.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "grad_norm": 1.578169822692871, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8738829493522644, + "num_tokens": 384808239.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "grad_norm": 1.5036123991012573, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8758043646812439, + "num_tokens": 384846961.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "grad_norm": 1.6248574256896973, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8880100250244141, + "num_tokens": 384877356.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "grad_norm": 1.4127200841903687, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8821632266044617, + "num_tokens": 
384919878.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "grad_norm": 1.5721814632415771, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8743318319320679, + "num_tokens": 384957907.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "grad_norm": 1.517478108406067, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8812575340270996, + "num_tokens": 384995817.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "grad_norm": 1.432058334350586, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8907380104064941, + "num_tokens": 385031381.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "grad_norm": 1.609893560409546, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8798822164535522, + "num_tokens": 385067721.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "grad_norm": 1.4992486238479614, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8724310398101807, + "num_tokens": 385108407.0, + "step": 10089 + }, + { + "epoch": 1.283551710978247, + "grad_norm": 1.5090115070343018, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.865187406539917, + "num_tokens": 385146427.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "grad_norm": 1.5855176448822021, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8706462383270264, + "num_tokens": 385180311.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "grad_norm": 1.476169466972351, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8696390390396118, + "num_tokens": 385219957.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "grad_norm": 1.6127643585205078, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8663889765739441, + "num_tokens": 385258891.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "grad_norm": 1.3843096494674683, + "learning_rate": 1e-06, 
+ "loss": 0.3333, + "mean_token_accuracy": 0.8827025294303894, + "num_tokens": 385302844.0, + "step": 10094 + }, + { + "epoch": 1.2841877623711997, + "grad_norm": 1.4600529670715332, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8802611827850342, + "num_tokens": 385341250.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "grad_norm": 1.5163321495056152, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8807675838470459, + "num_tokens": 385380927.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "grad_norm": 1.5995662212371826, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8738971948623657, + "num_tokens": 385414829.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "grad_norm": 1.5515661239624023, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8794291019439697, + "num_tokens": 385452211.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "grad_norm": 1.5610252618789673, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8666291236877441, + "num_tokens": 385488697.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "grad_norm": 1.546371340751648, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8814085721969604, + "num_tokens": 385522573.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "grad_norm": 1.5344388484954834, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8720438480377197, + "num_tokens": 385558817.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "grad_norm": 1.409374713897705, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8704769611358643, + "num_tokens": 385598371.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "grad_norm": 1.4735982418060303, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8845367431640625, + "num_tokens": 385637748.0, + "step": 10103 + }, + { + 
"epoch": 1.2853326548785142, + "grad_norm": 1.5461981296539307, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8811181783676147, + "num_tokens": 385674082.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "grad_norm": 1.5140759944915771, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8732221126556396, + "num_tokens": 385711906.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "grad_norm": 1.5393657684326172, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8772696256637573, + "num_tokens": 385750014.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 1.5258475542068481, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8728331923484802, + "num_tokens": 385785171.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "grad_norm": 1.5251624584197998, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8657110929489136, + "num_tokens": 385823401.0, + "step": 10108 + }, + { + "epoch": 1.2859687062714666, + "grad_norm": 1.5798572301864624, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8555279970169067, + "num_tokens": 385861568.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "grad_norm": 1.3697803020477295, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8955304622650146, + "num_tokens": 385898929.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "grad_norm": 1.5762981176376343, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8804899454116821, + "num_tokens": 385932585.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "grad_norm": 1.5689648389816284, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8693180680274963, + "num_tokens": 385972803.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "grad_norm": 1.505899429321289, + "learning_rate": 1e-06, + "loss": 0.3849, + 
"mean_token_accuracy": 0.8663414716720581, + "num_tokens": 386014060.0, + "step": 10113 + }, + { + "epoch": 1.2866047576644193, + "grad_norm": 1.5459215641021729, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8762263059616089, + "num_tokens": 386051900.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "grad_norm": 1.589888334274292, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8825817108154297, + "num_tokens": 386088625.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "grad_norm": 1.5379278659820557, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8769505023956299, + "num_tokens": 386124144.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "grad_norm": 1.6086299419403076, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8873810768127441, + "num_tokens": 386156441.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "grad_norm": 1.5565203428268433, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8778129816055298, + "num_tokens": 386192533.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "grad_norm": 1.4921481609344482, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8773915767669678, + "num_tokens": 386231474.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "grad_norm": 1.6240750551223755, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8664453029632568, + "num_tokens": 386266706.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "grad_norm": 1.5026599168777466, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8695586323738098, + "num_tokens": 386309067.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "grad_norm": 1.4775298833847046, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8708009123802185, + "num_tokens": 386346704.0, + "step": 10122 + }, + { + "epoch": 
1.2877496501717338, + "grad_norm": 1.5757725238800049, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8799484968185425, + "num_tokens": 386379647.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "grad_norm": 1.459230899810791, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8820937275886536, + "num_tokens": 386420825.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "grad_norm": 1.5637282133102417, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8785978555679321, + "num_tokens": 386457077.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "grad_norm": 1.4902307987213135, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8742529153823853, + "num_tokens": 386494602.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "grad_norm": 1.526222825050354, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8687608242034912, + "num_tokens": 386531064.0, + "step": 10127 + }, + { + "epoch": 1.2883857015646865, + "grad_norm": 1.3692270517349243, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8836061358451843, + "num_tokens": 386572372.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "grad_norm": 1.5395954847335815, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8771020770072937, + "num_tokens": 386607659.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "grad_norm": 1.5899425745010376, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8751223683357239, + "num_tokens": 386641322.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "grad_norm": 1.591418743133545, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8856812715530396, + "num_tokens": 386675986.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "grad_norm": 1.4855808019638062, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 
0.8864964246749878, + "num_tokens": 386713651.0, + "step": 10132 + }, + { + "epoch": 1.2890217529576389, + "grad_norm": 1.6519396305084229, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8755658864974976, + "num_tokens": 386745817.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "grad_norm": 1.5740513801574707, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8804775476455688, + "num_tokens": 386776516.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "grad_norm": 1.4572997093200684, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8965744376182556, + "num_tokens": 386813100.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "grad_norm": 1.4992427825927734, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8785735368728638, + "num_tokens": 386852083.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "grad_norm": 1.6062541007995605, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8688272833824158, + "num_tokens": 386888257.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "grad_norm": 1.6367647647857666, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8696205615997314, + "num_tokens": 386920762.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "grad_norm": 1.544060468673706, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8769039511680603, + "num_tokens": 386958392.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "grad_norm": 1.6422123908996582, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8745272159576416, + "num_tokens": 386991435.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "grad_norm": 1.5172828435897827, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8696656227111816, + "num_tokens": 387032043.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "grad_norm": 
1.4688925743103027, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8710191249847412, + "num_tokens": 387073764.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "grad_norm": 1.408367395401001, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8857091665267944, + "num_tokens": 387111563.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "grad_norm": 1.7585877180099487, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8782905340194702, + "num_tokens": 387146077.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "grad_norm": 1.5719906091690063, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8618820905685425, + "num_tokens": 387183320.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "grad_norm": 1.4224064350128174, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8670315146446228, + "num_tokens": 387226571.0, + "step": 10146 + }, + { + "epoch": 1.290802696857906, + "grad_norm": 1.441551685333252, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8730133771896362, + "num_tokens": 387265959.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "grad_norm": 1.4557439088821411, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8749209642410278, + "num_tokens": 387307702.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "grad_norm": 1.5681037902832031, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.870867133140564, + "num_tokens": 387346089.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "grad_norm": 1.438033103942871, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8785400390625, + "num_tokens": 387388280.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "grad_norm": 1.5809599161148071, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8744670152664185, + "num_tokens": 
387426945.0, + "step": 10151 + }, + { + "epoch": 1.2914387482508587, + "grad_norm": 1.4639592170715332, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.880312979221344, + "num_tokens": 387468216.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "grad_norm": 1.4465360641479492, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.870402455329895, + "num_tokens": 387512874.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "grad_norm": 1.4560405015945435, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8652383089065552, + "num_tokens": 387552489.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "grad_norm": 1.6131325960159302, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8792446851730347, + "num_tokens": 387585149.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "grad_norm": 1.492504358291626, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8773477673530579, + "num_tokens": 387622371.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "grad_norm": 1.484884262084961, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.886953592300415, + "num_tokens": 387658738.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "grad_norm": 1.585580587387085, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8791062235832214, + "num_tokens": 387693146.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "grad_norm": 1.573183536529541, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8665401339530945, + "num_tokens": 387732092.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "grad_norm": 1.457953929901123, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8740627765655518, + "num_tokens": 387770833.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "grad_norm": 1.6497070789337158, + "learning_rate": 
1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8649343252182007, + "num_tokens": 387805638.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "grad_norm": 1.723455786705017, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8545562028884888, + "num_tokens": 387837935.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "grad_norm": 1.4432275295257568, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8772352337837219, + "num_tokens": 387878763.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "grad_norm": 1.4151712656021118, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8654873371124268, + "num_tokens": 387923936.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "grad_norm": 1.4717282056808472, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8710060715675354, + "num_tokens": 387965675.0, + "step": 10165 + }, + { + "epoch": 1.2932196921511259, + "grad_norm": 1.5282628536224365, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8636392951011658, + "num_tokens": 388004162.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "grad_norm": 1.4539580345153809, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8847291469573975, + "num_tokens": 388043223.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "grad_norm": 1.5557714700698853, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8642925024032593, + "num_tokens": 388080528.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "grad_norm": 1.443655014038086, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8831494450569153, + "num_tokens": 388120112.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "grad_norm": 1.5131313800811768, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.872288167476654, + "num_tokens": 388158318.0, + "step": 10170 + }, + { + 
"epoch": 1.2938557435440783, + "grad_norm": 1.5112690925598145, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8697291612625122, + "num_tokens": 388196683.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "grad_norm": 1.543142318725586, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8824365139007568, + "num_tokens": 388235156.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "grad_norm": 1.4720110893249512, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8765767812728882, + "num_tokens": 388274810.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "grad_norm": 1.4625217914581299, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8758907914161682, + "num_tokens": 388314496.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "grad_norm": 1.3979125022888184, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8806225061416626, + "num_tokens": 388355047.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "grad_norm": 1.520666241645813, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8744394779205322, + "num_tokens": 388392011.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "grad_norm": 1.5564351081848145, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8512033224105835, + "num_tokens": 388432549.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "grad_norm": 1.611016869544983, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8783775568008423, + "num_tokens": 388468528.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "grad_norm": 1.5071873664855957, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8770822882652283, + "num_tokens": 388503722.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "grad_norm": 1.449501395225525, + "learning_rate": 1e-06, + "loss": 0.3429, + 
"mean_token_accuracy": 0.8786450028419495, + "num_tokens": 388544652.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "grad_norm": 1.7930711507797241, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8450921773910522, + "num_tokens": 388580444.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "grad_norm": 1.5449248552322388, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8743583559989929, + "num_tokens": 388620380.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "grad_norm": 1.5581707954406738, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.860878586769104, + "num_tokens": 388661440.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "grad_norm": 1.5074119567871094, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8714408278465271, + "num_tokens": 388701201.0, + "step": 10184 + }, + { + "epoch": 1.2956366874443455, + "grad_norm": 1.4244831800460815, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8684457540512085, + "num_tokens": 388746168.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "grad_norm": 1.4883712530136108, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8836215734481812, + "num_tokens": 388782203.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "grad_norm": 1.4590961933135986, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8790892362594604, + "num_tokens": 388821613.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "grad_norm": 1.5655187368392944, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8812559247016907, + "num_tokens": 388860885.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "grad_norm": 1.4389400482177734, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8730543851852417, + "num_tokens": 388903267.0, + "step": 10189 + }, + { + "epoch": 
1.296272738837298, + "grad_norm": 1.4895192384719849, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.878191351890564, + "num_tokens": 388941346.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "grad_norm": 1.5591493844985962, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8834782838821411, + "num_tokens": 388975219.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "grad_norm": 1.5957072973251343, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8683778047561646, + "num_tokens": 389011984.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "grad_norm": 1.5727969408035278, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8755123019218445, + "num_tokens": 389051134.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "grad_norm": 1.571075201034546, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8804934024810791, + "num_tokens": 389086140.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "grad_norm": 1.6132069826126099, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.875473141670227, + "num_tokens": 389119515.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "grad_norm": 1.6494781970977783, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8698104619979858, + "num_tokens": 389153713.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "grad_norm": 1.5477514266967773, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.859591007232666, + "num_tokens": 389193118.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "grad_norm": 1.589261770248413, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8537227511405945, + "num_tokens": 389229911.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "grad_norm": 1.5077637434005737, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 
0.8763132095336914, + "num_tokens": 389267160.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "grad_norm": 1.899893879890442, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8580154776573181, + "num_tokens": 389299024.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "grad_norm": 1.4944194555282593, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8864521384239197, + "num_tokens": 389337864.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "grad_norm": 1.7083696126937866, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8661468625068665, + "num_tokens": 389374920.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "grad_norm": 1.5713492631912231, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.880028247833252, + "num_tokens": 389412843.0, + "step": 10203 + }, + { + "epoch": 1.2980536827375653, + "grad_norm": 1.5343618392944336, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8899012804031372, + "num_tokens": 389444268.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "grad_norm": 1.6038180589675903, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8812322020530701, + "num_tokens": 389477798.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "grad_norm": 1.604369044303894, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8640058040618896, + "num_tokens": 389518968.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "grad_norm": 1.6906465291976929, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8660291433334351, + "num_tokens": 389554823.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "grad_norm": 1.6175285577774048, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8743951320648193, + "num_tokens": 389588651.0, + "step": 10208 + }, + { + "epoch": 1.2986897341305177, + "grad_norm": 
1.4613536596298218, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8706769943237305, + "num_tokens": 389629685.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "grad_norm": 1.5940886735916138, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.85378098487854, + "num_tokens": 389667334.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "grad_norm": 1.4699233770370483, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8804782032966614, + "num_tokens": 389705871.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "grad_norm": 1.5356389284133911, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8883124589920044, + "num_tokens": 389740603.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "grad_norm": 1.4587392807006836, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8860005140304565, + "num_tokens": 389779875.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "grad_norm": 1.5412942171096802, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.866112232208252, + "num_tokens": 389827427.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "grad_norm": 1.5529786348342896, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8681203126907349, + "num_tokens": 389868860.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "grad_norm": 1.4161474704742432, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8845602869987488, + "num_tokens": 389909639.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "grad_norm": 1.5579049587249756, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.886437714099884, + "num_tokens": 389944387.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "grad_norm": 1.6839758157730103, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8744161128997803, + "num_tokens": 
389977363.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "grad_norm": 1.5262646675109863, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8743857145309448, + "num_tokens": 390015660.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "grad_norm": 1.6019247770309448, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8707438707351685, + "num_tokens": 390053682.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "grad_norm": 1.4149441719055176, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8841525316238403, + "num_tokens": 390093506.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "grad_norm": 1.5073094367980957, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8865865468978882, + "num_tokens": 390127688.0, + "step": 10222 + }, + { + "epoch": 1.3004706780307849, + "grad_norm": 1.3765664100646973, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8828601241111755, + "num_tokens": 390170254.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "grad_norm": 1.477420449256897, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8695600032806396, + "num_tokens": 390214746.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "grad_norm": 1.5000756978988647, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8864325284957886, + "num_tokens": 390249131.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "grad_norm": 1.4969029426574707, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.877819836139679, + "num_tokens": 390292266.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "grad_norm": 1.4136078357696533, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8729314208030701, + "num_tokens": 390335642.0, + "step": 10227 + }, + { + "epoch": 1.3011067294237375, + "grad_norm": 1.5529011487960815, + "learning_rate": 
1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8803001642227173, + "num_tokens": 390370754.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "grad_norm": 1.408177137374878, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8767151832580566, + "num_tokens": 390416434.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "grad_norm": 1.6148681640625, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8714903593063354, + "num_tokens": 390450756.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "grad_norm": 1.376413106918335, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8820360898971558, + "num_tokens": 390493961.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "grad_norm": 1.41788911819458, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8790897130966187, + "num_tokens": 390539075.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "grad_norm": 1.5655782222747803, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8707948327064514, + "num_tokens": 390578851.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "grad_norm": 1.5729893445968628, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8755307197570801, + "num_tokens": 390615151.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "grad_norm": 1.503214716911316, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8699830770492554, + "num_tokens": 390657848.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "grad_norm": 1.493805170059204, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8846340179443359, + "num_tokens": 390694776.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "grad_norm": 1.625659704208374, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8783724904060364, + "num_tokens": 390728859.0, + "step": 10237 + }, + { + 
"epoch": 1.3023788322096426, + "grad_norm": 1.4528615474700928, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.881068229675293, + "num_tokens": 390769197.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "grad_norm": 1.4624767303466797, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8739765882492065, + "num_tokens": 390807847.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "grad_norm": 1.707780361175537, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8533022403717041, + "num_tokens": 390843653.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "grad_norm": 1.390100359916687, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8814104199409485, + "num_tokens": 390883853.0, + "step": 10241 + }, + { + "epoch": 1.3028876733240047, + "grad_norm": 1.4604601860046387, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8781492710113525, + "num_tokens": 390925093.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "grad_norm": 1.6552342176437378, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.872651219367981, + "num_tokens": 390957908.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "grad_norm": 1.5297642946243286, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8690365552902222, + "num_tokens": 390995843.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "grad_norm": 1.500299334526062, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8883906602859497, + "num_tokens": 391029282.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "grad_norm": 1.5137970447540283, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8729338645935059, + "num_tokens": 391067126.0, + "step": 10246 + }, + { + "epoch": 1.303523724716957, + "grad_norm": 1.5747289657592773, + "learning_rate": 1e-06, + "loss": 0.3583, + 
"mean_token_accuracy": 0.8738654851913452, + "num_tokens": 391103761.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "grad_norm": 1.6430879831314087, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8794808387756348, + "num_tokens": 391144724.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "grad_norm": 1.6313999891281128, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8777133822441101, + "num_tokens": 391179544.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "grad_norm": 1.5981121063232422, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8866885304450989, + "num_tokens": 391215132.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "grad_norm": 1.6021252870559692, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8625729084014893, + "num_tokens": 391253161.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "grad_norm": 1.8376893997192383, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8579404950141907, + "num_tokens": 391282523.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "grad_norm": 1.660886526107788, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8813585042953491, + "num_tokens": 391314134.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "grad_norm": 1.5573917627334595, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8701604008674622, + "num_tokens": 391352301.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "grad_norm": 1.6078431606292725, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8751425743103027, + "num_tokens": 391390851.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "grad_norm": 1.4642281532287598, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8743517398834229, + "num_tokens": 391433278.0, + "step": 10256 + }, + { + "epoch": 
1.3047958275028622, + "grad_norm": 1.550818920135498, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8729441165924072, + "num_tokens": 391473366.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "grad_norm": 1.4607456922531128, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8746482729911804, + "num_tokens": 391513419.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "grad_norm": 1.5434356927871704, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8706759214401245, + "num_tokens": 391553161.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "grad_norm": 1.5521767139434814, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8637645840644836, + "num_tokens": 391591167.0, + "step": 10260 + }, + { + "epoch": 1.3053046686172243, + "grad_norm": 1.4297592639923096, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8737806677818298, + "num_tokens": 391631480.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "grad_norm": 1.537418007850647, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8738679885864258, + "num_tokens": 391667670.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "grad_norm": 1.4937806129455566, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8730599880218506, + "num_tokens": 391706236.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "grad_norm": 1.5952593088150024, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8734792470932007, + "num_tokens": 391742916.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "grad_norm": 1.477231740951538, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8781106472015381, + "num_tokens": 391779038.0, + "step": 10265 + }, + { + "epoch": 1.305940720010177, + "grad_norm": 1.4247186183929443, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 
0.8851574063301086, + "num_tokens": 391817937.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "grad_norm": 1.4877426624298096, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8734085559844971, + "num_tokens": 391858775.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "grad_norm": 1.4118118286132812, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.886908769607544, + "num_tokens": 391897344.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "grad_norm": 1.5817418098449707, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8741124868392944, + "num_tokens": 391931997.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "grad_norm": 1.594553828239441, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8675270676612854, + "num_tokens": 391968299.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "grad_norm": 1.5682470798492432, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8581646680831909, + "num_tokens": 392005498.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "grad_norm": 1.5643668174743652, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8758927583694458, + "num_tokens": 392041103.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "grad_norm": 1.5758363008499146, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.873974084854126, + "num_tokens": 392076642.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "grad_norm": 1.4114700555801392, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8757132887840271, + "num_tokens": 392120778.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "grad_norm": 1.5564988851547241, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8796370029449463, + "num_tokens": 392154382.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "grad_norm": 
1.5055537223815918, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8742082118988037, + "num_tokens": 392191424.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "grad_norm": 1.5378344058990479, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8810657858848572, + "num_tokens": 392226030.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "grad_norm": 1.5933153629302979, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8793396949768066, + "num_tokens": 392266915.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "grad_norm": 1.499437689781189, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8681215047836304, + "num_tokens": 392306656.0, + "step": 10279 + }, + { + "epoch": 1.3077216639104439, + "grad_norm": 1.3667137622833252, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8897111415863037, + "num_tokens": 392349229.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "grad_norm": 1.6304523944854736, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8666666150093079, + "num_tokens": 392385052.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "grad_norm": 1.5778156518936157, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8578460812568665, + "num_tokens": 392425183.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "grad_norm": 1.552233099937439, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8571941256523132, + "num_tokens": 392465766.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "grad_norm": 1.745806336402893, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8589916825294495, + "num_tokens": 392496692.0, + "step": 10284 + }, + { + "epoch": 1.3083577153033965, + "grad_norm": 1.6512576341629028, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8776155710220337, + "num_tokens": 
392529365.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "grad_norm": 1.576675534248352, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8572558164596558, + "num_tokens": 392569651.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "grad_norm": 1.4723519086837769, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8704067468643188, + "num_tokens": 392609991.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "grad_norm": 1.661919355392456, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8755632638931274, + "num_tokens": 392644199.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "grad_norm": 1.5751045942306519, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8714265823364258, + "num_tokens": 392678508.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "grad_norm": 1.5086971521377563, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8655564785003662, + "num_tokens": 392724073.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "grad_norm": 1.6191704273223877, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8802029490470886, + "num_tokens": 392755274.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "grad_norm": 1.5616018772125244, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8703243732452393, + "num_tokens": 392793828.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "grad_norm": 1.5860569477081299, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8632242679595947, + "num_tokens": 392827913.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "grad_norm": 1.502409815788269, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8756405115127563, + "num_tokens": 392866045.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "grad_norm": 1.5166761875152588, + "learning_rate": 
1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8787943124771118, + "num_tokens": 392902881.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "grad_norm": 1.4586502313613892, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8752244114875793, + "num_tokens": 392942948.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "grad_norm": 1.7523423433303833, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8529344201087952, + "num_tokens": 392977463.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "grad_norm": 1.6021192073822021, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8692265748977661, + "num_tokens": 393014686.0, + "step": 10298 + }, + { + "epoch": 1.3101386592036637, + "grad_norm": 1.5639413595199585, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8703136444091797, + "num_tokens": 393052236.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "grad_norm": 1.5959032773971558, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8825857639312744, + "num_tokens": 393087367.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "grad_norm": 1.5070335865020752, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8685077428817749, + "num_tokens": 393130039.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "grad_norm": 1.5192056894302368, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.884493350982666, + "num_tokens": 393165736.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "grad_norm": 1.4446574449539185, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.879160463809967, + "num_tokens": 393207650.0, + "step": 10303 + }, + { + "epoch": 1.310774710596616, + "grad_norm": 1.469411849975586, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8889340162277222, + "num_tokens": 393243874.0, + "step": 10304 + }, + { 
+ "epoch": 1.3109019208752066, + "grad_norm": 1.4661478996276855, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8775079250335693, + "num_tokens": 393286119.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "grad_norm": 1.550652027130127, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8787916898727417, + "num_tokens": 393320213.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "grad_norm": 1.547284722328186, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8720810413360596, + "num_tokens": 393358001.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "grad_norm": 1.4693118333816528, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8658567070960999, + "num_tokens": 393399878.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "grad_norm": 1.4102336168289185, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8808705806732178, + "num_tokens": 393444675.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "grad_norm": 1.352020263671875, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8804031610488892, + "num_tokens": 393488883.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "grad_norm": 1.5837743282318115, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.869627058506012, + "num_tokens": 393527841.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "grad_norm": 1.6710803508758545, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.869928240776062, + "num_tokens": 393564160.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "grad_norm": 1.547874927520752, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.866205096244812, + "num_tokens": 393604687.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "grad_norm": 1.5775060653686523, + "learning_rate": 1e-06, + "loss": 0.3466, + 
"mean_token_accuracy": 0.8768413066864014, + "num_tokens": 393640529.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "grad_norm": 1.5561094284057617, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.867244303226471, + "num_tokens": 393680663.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "grad_norm": 1.612776756286621, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8671858906745911, + "num_tokens": 393714830.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "grad_norm": 1.4576009511947632, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.9025269746780396, + "num_tokens": 393754302.0, + "step": 10317 + }, + { + "epoch": 1.3125556544968833, + "grad_norm": 1.4800610542297363, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8761101961135864, + "num_tokens": 393797066.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "grad_norm": 1.694064974784851, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8790844678878784, + "num_tokens": 393836293.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "grad_norm": 1.4573777914047241, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.876628041267395, + "num_tokens": 393880457.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "grad_norm": 1.554969072341919, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8756281137466431, + "num_tokens": 393923413.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "grad_norm": 1.5591609477996826, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8780570030212402, + "num_tokens": 393959222.0, + "step": 10322 + }, + { + "epoch": 1.313191705889836, + "grad_norm": 1.5105597972869873, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8738992214202881, + "num_tokens": 393997477.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, 
+ "grad_norm": 1.5506057739257812, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8800281882286072, + "num_tokens": 394030807.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "grad_norm": 1.749322772026062, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8759182691574097, + "num_tokens": 394062933.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "grad_norm": 1.5343722105026245, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8683424592018127, + "num_tokens": 394106469.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "grad_norm": 1.6604764461517334, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8589625954627991, + "num_tokens": 394144100.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "grad_norm": 1.788832426071167, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.883284330368042, + "num_tokens": 394178733.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "grad_norm": 1.507526159286499, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8741934299468994, + "num_tokens": 394220359.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "grad_norm": 1.4572008848190308, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8794090747833252, + "num_tokens": 394262860.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "grad_norm": 1.485561728477478, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8602920770645142, + "num_tokens": 394307281.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "grad_norm": 1.431363821029663, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8845977783203125, + "num_tokens": 394345874.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "grad_norm": 1.4087918996810913, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.876279890537262, + 
"num_tokens": 394388867.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "grad_norm": 1.4933886528015137, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.881169319152832, + "num_tokens": 394427436.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "grad_norm": 1.5780881643295288, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8783763647079468, + "num_tokens": 394464258.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "grad_norm": 1.5411550998687744, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8766806125640869, + "num_tokens": 394500266.0, + "step": 10336 + }, + { + "epoch": 1.314972649790103, + "grad_norm": 1.3517606258392334, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8719130754470825, + "num_tokens": 394548082.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "grad_norm": 1.408075213432312, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8745565414428711, + "num_tokens": 394593570.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "grad_norm": 1.5045816898345947, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8744726181030273, + "num_tokens": 394634821.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "grad_norm": 1.4669510126113892, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8760176301002502, + "num_tokens": 394674600.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "grad_norm": 1.5149985551834106, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8782598376274109, + "num_tokens": 394712118.0, + "step": 10341 + }, + { + "epoch": 1.3156087011830557, + "grad_norm": 1.771679401397705, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.870970606803894, + "num_tokens": 394750876.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "grad_norm": 1.4715157747268677, + 
"learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8779605627059937, + "num_tokens": 394791149.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "grad_norm": 1.543704867362976, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8830890655517578, + "num_tokens": 394827694.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "grad_norm": 1.4694738388061523, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8713085651397705, + "num_tokens": 394867598.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "grad_norm": 1.5550669431686401, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8883687853813171, + "num_tokens": 394899453.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "grad_norm": 1.5415664911270142, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8727532625198364, + "num_tokens": 394941343.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "grad_norm": 1.5851044654846191, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8714326620101929, + "num_tokens": 394981955.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "grad_norm": 1.600675344467163, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8658157587051392, + "num_tokens": 395019012.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "grad_norm": 1.6345051527023315, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8683066368103027, + "num_tokens": 395051768.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "grad_norm": 1.48458993434906, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8727654218673706, + "num_tokens": 395092475.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "grad_norm": 1.5670127868652344, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8731451034545898, + "num_tokens": 395128787.0, + 
"step": 10352 + }, + { + "epoch": 1.317008014247551, + "grad_norm": 1.5968800783157349, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8700621128082275, + "num_tokens": 395166002.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "grad_norm": 1.6574004888534546, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8658372163772583, + "num_tokens": 395202690.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "grad_norm": 1.468339204788208, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8788843154907227, + "num_tokens": 395242196.0, + "step": 10355 + }, + { + "epoch": 1.3173896450833227, + "grad_norm": 1.5843794345855713, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8705130815505981, + "num_tokens": 395278677.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "grad_norm": 1.4473506212234497, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8948566913604736, + "num_tokens": 395315229.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "grad_norm": 1.483750581741333, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8798453211784363, + "num_tokens": 395355771.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "grad_norm": 1.5664578676223755, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8773501515388489, + "num_tokens": 395390842.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "grad_norm": 1.6158350706100464, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8888186812400818, + "num_tokens": 395425698.0, + "step": 10360 + }, + { + "epoch": 1.3180256964762753, + "grad_norm": 1.567461609840393, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8646582365036011, + "num_tokens": 395462171.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "grad_norm": 1.566996693611145, + "learning_rate": 1e-06, + 
"loss": 0.3462, + "mean_token_accuracy": 0.8757805824279785, + "num_tokens": 395500232.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "grad_norm": 1.5200715065002441, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8727799654006958, + "num_tokens": 395539079.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "grad_norm": 1.568745732307434, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8765739798545837, + "num_tokens": 395573273.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "grad_norm": 1.6957260370254517, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8702419996261597, + "num_tokens": 395605100.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "grad_norm": 1.5203392505645752, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8576364517211914, + "num_tokens": 395648374.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "grad_norm": 1.4846246242523193, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8670606017112732, + "num_tokens": 395691841.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "grad_norm": 1.6125974655151367, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.86405348777771, + "num_tokens": 395727895.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "grad_norm": 1.5278962850570679, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8748392462730408, + "num_tokens": 395766467.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "grad_norm": 1.5144871473312378, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8819267749786377, + "num_tokens": 395806163.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "grad_norm": 1.5183767080307007, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8757553100585938, + "num_tokens": 395844526.0, + "step": 10371 + }, + { + 
"epoch": 1.319425009540771, + "grad_norm": 1.4894400835037231, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8911910057067871, + "num_tokens": 395881291.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "grad_norm": 1.4125133752822876, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8746932148933411, + "num_tokens": 395926086.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "grad_norm": 1.5121800899505615, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8904675841331482, + "num_tokens": 395961184.0, + "step": 10374 + }, + { + "epoch": 1.3198066403765425, + "grad_norm": 1.412986159324646, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8895974159240723, + "num_tokens": 396001796.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "grad_norm": 1.4729548692703247, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8828331828117371, + "num_tokens": 396042618.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "grad_norm": 1.5724539756774902, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8637693524360657, + "num_tokens": 396081263.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "grad_norm": 1.5162205696105957, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8683042526245117, + "num_tokens": 396119325.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "grad_norm": 1.7810273170471191, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8773228526115417, + "num_tokens": 396148578.0, + "step": 10379 + }, + { + "epoch": 1.320442691769495, + "grad_norm": 1.7071584463119507, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.872732400894165, + "num_tokens": 396185851.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "grad_norm": 1.4780062437057495, + "learning_rate": 1e-06, + "loss": 0.357, + 
"mean_token_accuracy": 0.8776683807373047, + "num_tokens": 396229458.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "grad_norm": 1.6397022008895874, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8700978755950928, + "num_tokens": 396265007.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "grad_norm": 1.3498449325561523, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8908455967903137, + "num_tokens": 396304456.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "grad_norm": 1.5023674964904785, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8647618293762207, + "num_tokens": 396346521.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "grad_norm": 1.5333274602890015, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.872280478477478, + "num_tokens": 396383130.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "grad_norm": 1.5117113590240479, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8761646747589111, + "num_tokens": 396418858.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "grad_norm": 1.5129878520965576, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8661948442459106, + "num_tokens": 396458568.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "grad_norm": 1.4932827949523926, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8932257890701294, + "num_tokens": 396493868.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "grad_norm": 1.6023976802825928, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8618696331977844, + "num_tokens": 396530172.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "grad_norm": 1.5063759088516235, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8775339722633362, + "num_tokens": 396566946.0, + "step": 10390 + }, + { + "epoch": 
1.3218420048339907, + "grad_norm": 1.6006594896316528, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8564653992652893, + "num_tokens": 396601644.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "grad_norm": 1.6843390464782715, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8669090270996094, + "num_tokens": 396637538.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "grad_norm": 1.3736790418624878, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8854818940162659, + "num_tokens": 396680721.0, + "step": 10393 + }, + { + "epoch": 1.322223635669762, + "grad_norm": 1.3619978427886963, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8906694054603577, + "num_tokens": 396723769.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "grad_norm": 1.6543761491775513, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8614821434020996, + "num_tokens": 396757772.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "grad_norm": 1.4736977815628052, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8814648389816284, + "num_tokens": 396800095.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "grad_norm": 1.4556628465652466, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8685541152954102, + "num_tokens": 396841281.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "grad_norm": 1.5216095447540283, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8650224208831787, + "num_tokens": 396880980.0, + "step": 10398 + }, + { + "epoch": 1.3228596870627147, + "grad_norm": 1.4503273963928223, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8877658843994141, + "num_tokens": 396919773.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "grad_norm": 1.557058572769165, + "learning_rate": 1e-06, + "loss": 0.3667, + 
"mean_token_accuracy": 0.8735065460205078, + "num_tokens": 396957946.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "grad_norm": 1.3869736194610596, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8772619962692261, + "num_tokens": 396999937.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "grad_norm": 1.5435161590576172, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8627699613571167, + "num_tokens": 397036339.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "grad_norm": 1.4804117679595947, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8627676963806152, + "num_tokens": 397078874.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "grad_norm": 1.5215431451797485, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8809070587158203, + "num_tokens": 397114568.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "grad_norm": 1.4316036701202393, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8884320259094238, + "num_tokens": 397154978.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "grad_norm": 1.5322411060333252, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8720659017562866, + "num_tokens": 397193793.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "grad_norm": 1.438127040863037, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8723477721214294, + "num_tokens": 397237469.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "grad_norm": 1.6606777906417847, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8478000164031982, + "num_tokens": 397275726.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "grad_norm": 1.4319791793823242, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8839294910430908, + "num_tokens": 397313902.0, + "step": 10409 + }, + { + "epoch": 
1.3242590001272103, + "grad_norm": 1.57612144947052, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8768578767776489, + "num_tokens": 397354250.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "grad_norm": 1.4324158430099487, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8881703615188599, + "num_tokens": 397393173.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "grad_norm": 1.488269329071045, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8910291790962219, + "num_tokens": 397426308.0, + "step": 10412 + }, + { + "epoch": 1.324640630962982, + "grad_norm": 1.435258150100708, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8619614243507385, + "num_tokens": 397471548.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "grad_norm": 1.4741917848587036, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.874416708946228, + "num_tokens": 397515236.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "grad_norm": 1.560643196105957, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8721052408218384, + "num_tokens": 397554129.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "grad_norm": 1.6195694208145142, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8597670793533325, + "num_tokens": 397591323.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "grad_norm": 1.5165966749191284, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8706310987472534, + "num_tokens": 397629280.0, + "step": 10417 + }, + { + "epoch": 1.3252766823559343, + "grad_norm": 1.5455197095870972, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8797135353088379, + "num_tokens": 397663251.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "grad_norm": 1.5830656290054321, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 
0.8787767887115479, + "num_tokens": 397696332.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "grad_norm": 1.4356547594070435, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8878689408302307, + "num_tokens": 397735948.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "grad_norm": 1.5797885656356812, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8664958477020264, + "num_tokens": 397771218.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "grad_norm": 1.4900951385498047, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8610944747924805, + "num_tokens": 397813210.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "grad_norm": 1.3826513290405273, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8872225284576416, + "num_tokens": 397853683.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "grad_norm": 1.5740970373153687, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8664312362670898, + "num_tokens": 397888060.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "grad_norm": 1.5239237546920776, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8614324331283569, + "num_tokens": 397926727.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "grad_norm": 1.457043170928955, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8682738542556763, + "num_tokens": 397967877.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "grad_norm": 1.4134907722473145, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8820595145225525, + "num_tokens": 398011763.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "grad_norm": 1.4515036344528198, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8784517645835876, + "num_tokens": 398049800.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "grad_norm": 
1.530259370803833, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8531268835067749, + "num_tokens": 398095243.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "grad_norm": 1.452287197113037, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8904839158058167, + "num_tokens": 398129761.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "grad_norm": 1.4706588983535767, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8866547346115112, + "num_tokens": 398166122.0, + "step": 10431 + }, + { + "epoch": 1.3270576262562015, + "grad_norm": 1.5525681972503662, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8693950176239014, + "num_tokens": 398203465.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "grad_norm": 1.6232633590698242, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8733991980552673, + "num_tokens": 398234988.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "grad_norm": 1.494470477104187, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8839477300643921, + "num_tokens": 398269594.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "grad_norm": 1.5977362394332886, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8766495585441589, + "num_tokens": 398305162.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "grad_norm": 1.556156873703003, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8710632920265198, + "num_tokens": 398342151.0, + "step": 10436 + }, + { + "epoch": 1.3276936776491541, + "grad_norm": 1.4933323860168457, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8680713176727295, + "num_tokens": 398382428.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "grad_norm": 1.5203192234039307, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8765900135040283, + "num_tokens": 
398419289.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "grad_norm": 1.5597615242004395, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8707287311553955, + "num_tokens": 398461415.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "grad_norm": 1.4804426431655884, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8908538818359375, + "num_tokens": 398499620.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "grad_norm": 1.7350071668624878, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8740745186805725, + "num_tokens": 398529225.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "grad_norm": 1.447811484336853, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8710407018661499, + "num_tokens": 398574242.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "grad_norm": 1.6219576597213745, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8591260313987732, + "num_tokens": 398614514.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "grad_norm": 1.5125173330307007, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8743807673454285, + "num_tokens": 398653087.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "grad_norm": 1.4522597789764404, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8949792385101318, + "num_tokens": 398690781.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "grad_norm": 1.5766141414642334, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.873975396156311, + "num_tokens": 398727229.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "grad_norm": 1.572139024734497, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8763419389724731, + "num_tokens": 398760992.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "grad_norm": 1.3774971961975098, + "learning_rate": 
1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8834514617919922, + "num_tokens": 398802452.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "grad_norm": 1.489685297012329, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8782988786697388, + "num_tokens": 398838717.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "grad_norm": 1.4884133338928223, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.858368992805481, + "num_tokens": 398881919.0, + "step": 10450 + }, + { + "epoch": 1.329474621549421, + "grad_norm": 1.5923261642456055, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8566367626190186, + "num_tokens": 398919066.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "grad_norm": 1.621109962463379, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8858180642127991, + "num_tokens": 398950324.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "grad_norm": 1.4692950248718262, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8822876214981079, + "num_tokens": 398991228.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "grad_norm": 1.470354676246643, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8774333000183105, + "num_tokens": 399030413.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "grad_norm": 1.5177627801895142, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.875640869140625, + "num_tokens": 399067282.0, + "step": 10455 + }, + { + "epoch": 1.3301106729423737, + "grad_norm": 1.5315663814544678, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8697509765625, + "num_tokens": 399104276.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "grad_norm": 1.5821666717529297, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8764044046401978, + "num_tokens": 399137809.0, + "step": 10457 + }, + { + 
"epoch": 1.3303650934995548, + "grad_norm": 1.5645781755447388, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8563156127929688, + "num_tokens": 399176041.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "grad_norm": 1.5089653730392456, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8795106410980225, + "num_tokens": 399215305.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "grad_norm": 1.776994228363037, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8650593161582947, + "num_tokens": 399248550.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "grad_norm": 1.5583186149597168, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8653274178504944, + "num_tokens": 399284932.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "grad_norm": 1.7540467977523804, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8908843398094177, + "num_tokens": 399314605.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "grad_norm": 1.4804816246032715, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8891428112983704, + "num_tokens": 399351794.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "grad_norm": 1.5819038152694702, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8625919222831726, + "num_tokens": 399389765.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "grad_norm": 1.397962212562561, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8827694654464722, + "num_tokens": 399429435.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "grad_norm": 1.5286626815795898, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.886444091796875, + "num_tokens": 399464056.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "grad_norm": 1.4009323120117188, + "learning_rate": 1e-06, + "loss": 0.3377, + 
"mean_token_accuracy": 0.880361795425415, + "num_tokens": 399504015.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "grad_norm": 1.5953115224838257, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8603847026824951, + "num_tokens": 399540849.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "grad_norm": 1.4381717443466187, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8809259533882141, + "num_tokens": 399580175.0, + "step": 10469 + }, + { + "epoch": 1.331891616842641, + "grad_norm": 1.5158981084823608, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8679598569869995, + "num_tokens": 399616160.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "grad_norm": 1.388832926750183, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8748131990432739, + "num_tokens": 399660071.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "grad_norm": 1.5396459102630615, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.874070405960083, + "num_tokens": 399695593.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "grad_norm": 1.526094913482666, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8880364298820496, + "num_tokens": 399730001.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "grad_norm": 1.4711326360702515, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8735290765762329, + "num_tokens": 399769123.0, + "step": 10474 + }, + { + "epoch": 1.3325276682355933, + "grad_norm": 1.4715213775634766, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.849327564239502, + "num_tokens": 399815134.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "grad_norm": 1.554213285446167, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8775925636291504, + "num_tokens": 399849865.0, + "step": 10476 + }, + { + "epoch": 
1.3327820887927744, + "grad_norm": 1.6643412113189697, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8698195219039917, + "num_tokens": 399883825.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "grad_norm": 1.4510084390640259, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8729573488235474, + "num_tokens": 399921156.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "grad_norm": 1.5046366453170776, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8727957010269165, + "num_tokens": 399963078.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "grad_norm": 1.4832305908203125, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8886876702308655, + "num_tokens": 399999948.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "grad_norm": 1.5810344219207764, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8580721616744995, + "num_tokens": 400039374.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "grad_norm": 1.5865429639816284, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8493404388427734, + "num_tokens": 400081608.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "grad_norm": 1.6751724481582642, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8702700138092041, + "num_tokens": 400112925.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "grad_norm": 1.4216387271881104, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8833963871002197, + "num_tokens": 400151353.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "grad_norm": 1.5338939428329468, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8683897256851196, + "num_tokens": 400189391.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "grad_norm": 1.4318478107452393, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 
0.881461501121521, + "num_tokens": 400229411.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "grad_norm": 1.6154481172561646, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8582227230072021, + "num_tokens": 400266267.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "grad_norm": 1.4773614406585693, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8818138837814331, + "num_tokens": 400304672.0, + "step": 10488 + }, + { + "epoch": 1.3343086121358605, + "grad_norm": 1.6786149740219116, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8730757236480713, + "num_tokens": 400337958.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "grad_norm": 1.4528621435165405, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.900081992149353, + "num_tokens": 400373450.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "grad_norm": 1.57001793384552, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8725992441177368, + "num_tokens": 400407338.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "grad_norm": 1.5157657861709595, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8788338899612427, + "num_tokens": 400441766.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "grad_norm": 1.6027936935424805, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8683559894561768, + "num_tokens": 400478366.0, + "step": 10493 + }, + { + "epoch": 1.3349446635288131, + "grad_norm": 1.5701451301574707, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8867867588996887, + "num_tokens": 400517412.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "grad_norm": 1.5203763246536255, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8885629177093506, + "num_tokens": 400553105.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "grad_norm": 
1.601090431213379, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.877311110496521, + "num_tokens": 400588809.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "grad_norm": 1.4524614810943604, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8776015639305115, + "num_tokens": 400627881.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "grad_norm": 1.5529180765151978, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8828545212745667, + "num_tokens": 400662047.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "grad_norm": 1.3993571996688843, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8804284334182739, + "num_tokens": 400707975.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "grad_norm": 1.5240806341171265, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8790693879127502, + "num_tokens": 400746227.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "grad_norm": 1.481351375579834, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8816359639167786, + "num_tokens": 400786144.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "grad_norm": 1.4879881143569946, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8793720602989197, + "num_tokens": 400828035.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "grad_norm": 1.639185905456543, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8750528693199158, + "num_tokens": 400865428.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "grad_norm": 1.4033656120300293, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8728753328323364, + "num_tokens": 400907801.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "grad_norm": 1.5262582302093506, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8700013756752014, + "num_tokens": 
400945745.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "grad_norm": 1.610007882118225, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8762709498405457, + "num_tokens": 400980889.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "grad_norm": 1.4247963428497314, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8815383911132812, + "num_tokens": 401020476.0, + "step": 10507 + }, + { + "epoch": 1.3367256074290803, + "grad_norm": 1.4991739988327026, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8776305913925171, + "num_tokens": 401058030.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "grad_norm": 1.5817222595214844, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8713587522506714, + "num_tokens": 401093083.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "grad_norm": 1.5551239252090454, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.872534453868866, + "num_tokens": 401129927.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "grad_norm": 1.5053175687789917, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8748632669448853, + "num_tokens": 401168984.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "grad_norm": 1.6106393337249756, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8730810284614563, + "num_tokens": 401204772.0, + "step": 10512 + }, + { + "epoch": 1.337361658822033, + "grad_norm": 1.5094574689865112, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8831214308738708, + "num_tokens": 401243160.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "grad_norm": 1.5506876707077026, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8801500797271729, + "num_tokens": 401282440.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "grad_norm": 1.564989447593689, + "learning_rate": 
1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8682215213775635, + "num_tokens": 401317755.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "grad_norm": 1.5833250284194946, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8536458015441895, + "num_tokens": 401358374.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "grad_norm": 1.492195963859558, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8669002056121826, + "num_tokens": 401398544.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "grad_norm": 1.5504109859466553, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8814647197723389, + "num_tokens": 401433434.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "grad_norm": 1.4597458839416504, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8549631237983704, + "num_tokens": 401475484.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "grad_norm": 1.5335205793380737, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8722225427627563, + "num_tokens": 401513491.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "grad_norm": 1.6639918088912964, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.874549150466919, + "num_tokens": 401546394.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "grad_norm": 1.4166520833969116, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8860465884208679, + "num_tokens": 401587815.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "grad_norm": 1.4407416582107544, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.885932445526123, + "num_tokens": 401631282.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "grad_norm": 1.5364775657653809, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8691664934158325, + "num_tokens": 401672180.0, + "step": 10524 + }, + { 
+ "epoch": 1.3388881821651188, + "grad_norm": 1.6149604320526123, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8706795573234558, + "num_tokens": 401704818.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "grad_norm": 1.4302759170532227, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8694865703582764, + "num_tokens": 401746956.0, + "step": 10526 + }, + { + "epoch": 1.3391426027223, + "grad_norm": 1.6571412086486816, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8755428194999695, + "num_tokens": 401782578.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "grad_norm": 1.5786014795303345, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8795764446258545, + "num_tokens": 401817362.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "grad_norm": 1.4607739448547363, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8714312314987183, + "num_tokens": 401858131.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "grad_norm": 1.5965949296951294, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8429007530212402, + "num_tokens": 401897255.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "grad_norm": 1.50141179561615, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8709937334060669, + "num_tokens": 401938768.0, + "step": 10531 + }, + { + "epoch": 1.3397786541152525, + "grad_norm": 1.4737310409545898, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8823516368865967, + "num_tokens": 401979887.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "grad_norm": 1.5278494358062744, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8718502521514893, + "num_tokens": 402018526.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "grad_norm": 1.570004940032959, + "learning_rate": 1e-06, + "loss": 0.3852, + 
"mean_token_accuracy": 0.8648658394813538, + "num_tokens": 402056304.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "grad_norm": 1.5676690340042114, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8765177726745605, + "num_tokens": 402094487.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "grad_norm": 1.5383834838867188, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8882603049278259, + "num_tokens": 402128222.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "grad_norm": 1.4017882347106934, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.887122631072998, + "num_tokens": 402169468.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "grad_norm": 1.399954080581665, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8732051253318787, + "num_tokens": 402213736.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "grad_norm": 1.6979697942733765, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8646721839904785, + "num_tokens": 402246931.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "grad_norm": 1.3572745323181152, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8880264163017273, + "num_tokens": 402290712.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "grad_norm": 1.4894922971725464, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8809665441513062, + "num_tokens": 402329933.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "grad_norm": 1.811207890510559, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8692810535430908, + "num_tokens": 402360216.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "grad_norm": 1.6323418617248535, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8818025588989258, + "num_tokens": 402391704.0, + "step": 10543 + }, + { + "epoch": 
1.3413051774583387, + "grad_norm": 1.737325668334961, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8773240447044373, + "num_tokens": 402426456.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "grad_norm": 1.5278890132904053, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.889236569404602, + "num_tokens": 402462289.0, + "step": 10545 + }, + { + "epoch": 1.3415595980155197, + "grad_norm": 1.6233654022216797, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.86320960521698, + "num_tokens": 402499670.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "grad_norm": 1.4888418912887573, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8805866241455078, + "num_tokens": 402537687.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "grad_norm": 1.6189851760864258, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.871314287185669, + "num_tokens": 402569222.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "grad_norm": 1.8120805025100708, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8858762383460999, + "num_tokens": 402607079.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "grad_norm": 1.6399606466293335, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8779231309890747, + "num_tokens": 402641750.0, + "step": 10550 + }, + { + "epoch": 1.3421956494084721, + "grad_norm": 1.4871416091918945, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8707908987998962, + "num_tokens": 402684952.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "grad_norm": 1.5200194120407104, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8695630431175232, + "num_tokens": 402724961.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "grad_norm": 1.452587366104126, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 
0.8755209445953369, + "num_tokens": 402763375.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "grad_norm": 1.4495428800582886, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8689633011817932, + "num_tokens": 402807287.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "grad_norm": 1.3872435092926025, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8842827081680298, + "num_tokens": 402849065.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "grad_norm": 1.4859554767608643, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8819560408592224, + "num_tokens": 402887120.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "grad_norm": 1.5206482410430908, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8811092376708984, + "num_tokens": 402923653.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "grad_norm": 1.4428871870040894, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8720822334289551, + "num_tokens": 402964834.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "grad_norm": 1.4507317543029785, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8840290307998657, + "num_tokens": 403003206.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "grad_norm": 1.5223033428192139, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8687148094177246, + "num_tokens": 403044200.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "grad_norm": 1.4956415891647339, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8680347204208374, + "num_tokens": 403083580.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "grad_norm": 1.449403166770935, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.867958664894104, + "num_tokens": 403127417.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "grad_norm": 
1.4789725542068481, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8644793629646301, + "num_tokens": 403168563.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "grad_norm": 1.615484356880188, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8709767460823059, + "num_tokens": 403201964.0, + "step": 10564 + }, + { + "epoch": 1.3439765933087393, + "grad_norm": 1.5807453393936157, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8710138201713562, + "num_tokens": 403237786.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "grad_norm": 1.6228703260421753, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8578757047653198, + "num_tokens": 403272583.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "grad_norm": 1.539353609085083, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8763279318809509, + "num_tokens": 403307056.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "grad_norm": 1.5318379402160645, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8883773684501648, + "num_tokens": 403343613.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "grad_norm": 1.5117285251617432, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8723807334899902, + "num_tokens": 403380449.0, + "step": 10569 + }, + { + "epoch": 1.344612644701692, + "grad_norm": 1.419107437133789, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8732060790061951, + "num_tokens": 403421229.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "grad_norm": 1.4916280508041382, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8858535289764404, + "num_tokens": 403454951.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "grad_norm": 1.4692491292953491, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8843233585357666, + "num_tokens": 
403492614.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "grad_norm": 1.49423086643219, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8843992948532104, + "num_tokens": 403528974.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "grad_norm": 1.4855202436447144, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8803907632827759, + "num_tokens": 403566113.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "grad_norm": 1.4547359943389893, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.876863420009613, + "num_tokens": 403606414.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "grad_norm": 1.4756293296813965, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8874359726905823, + "num_tokens": 403644519.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "grad_norm": 1.5493617057800293, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8859004974365234, + "num_tokens": 403685608.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "grad_norm": 1.459633469581604, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8816348910331726, + "num_tokens": 403726297.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "grad_norm": 1.5870195627212524, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8697289228439331, + "num_tokens": 403762814.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "grad_norm": 1.6487165689468384, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8902608752250671, + "num_tokens": 403794139.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "grad_norm": 1.4751983880996704, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8810518980026245, + "num_tokens": 403835533.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "grad_norm": 1.6062562465667725, + "learning_rate": 
1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8748486638069153, + "num_tokens": 403869146.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "grad_norm": 1.4468026161193848, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8753221035003662, + "num_tokens": 403909617.0, + "step": 10583 + }, + { + "epoch": 1.3463935886019591, + "grad_norm": 1.4720615148544312, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8860135078430176, + "num_tokens": 403949788.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "grad_norm": 1.6304519176483154, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8803317546844482, + "num_tokens": 403984323.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "grad_norm": 1.5215171575546265, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8804365396499634, + "num_tokens": 404022748.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "grad_norm": 1.618286371231079, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8685715198516846, + "num_tokens": 404060690.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "grad_norm": 1.4583609104156494, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8784152269363403, + "num_tokens": 404100093.0, + "step": 10588 + }, + { + "epoch": 1.3470296399949115, + "grad_norm": 1.4859756231307983, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8943883180618286, + "num_tokens": 404133175.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "grad_norm": 1.6837843656539917, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8627786636352539, + "num_tokens": 404167125.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "grad_norm": 1.5142666101455688, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8790176510810852, + "num_tokens": 404206347.0, + "step": 10591 + }, + 
{ + "epoch": 1.3474112708306831, + "grad_norm": 1.5250073671340942, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8798924684524536, + "num_tokens": 404240669.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "grad_norm": 1.5289031267166138, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8916406631469727, + "num_tokens": 404275936.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "grad_norm": 1.6121865510940552, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8758290410041809, + "num_tokens": 404312663.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "grad_norm": 1.615352749824524, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8706141710281372, + "num_tokens": 404349041.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "grad_norm": 1.4835240840911865, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8822093605995178, + "num_tokens": 404386497.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "grad_norm": 1.5026274919509888, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8577308654785156, + "num_tokens": 404428110.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "grad_norm": 1.5329986810684204, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8924669623374939, + "num_tokens": 404460617.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "grad_norm": 1.5383026599884033, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8826998472213745, + "num_tokens": 404494170.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "grad_norm": 1.5760873556137085, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8860687017440796, + "num_tokens": 404530117.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "grad_norm": 1.4365434646606445, + "learning_rate": 1e-06, + "loss": 0.3455, + 
"mean_token_accuracy": 0.8768094182014465, + "num_tokens": 404574145.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "grad_norm": 1.4814603328704834, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8841376900672913, + "num_tokens": 404611586.0, + "step": 10602 + }, + { + "epoch": 1.3488105838951787, + "grad_norm": 1.4875534772872925, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8780120611190796, + "num_tokens": 404651894.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "grad_norm": 1.461998701095581, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.886557400226593, + "num_tokens": 404692056.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "grad_norm": 1.5504988431930542, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8874911069869995, + "num_tokens": 404725506.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "grad_norm": 1.5578523874282837, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8782505989074707, + "num_tokens": 404763612.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "grad_norm": 1.4363240003585815, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8842558860778809, + "num_tokens": 404804346.0, + "step": 10607 + }, + { + "epoch": 1.3494466352881314, + "grad_norm": 1.7208223342895508, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8670116662979126, + "num_tokens": 404836669.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "grad_norm": 1.5078365802764893, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8751884698867798, + "num_tokens": 404876392.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "grad_norm": 1.2994940280914307, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8999845385551453, + "num_tokens": 404919154.0, + "step": 10610 + }, + { + "epoch": 
1.349828266123903, + "grad_norm": 1.5463451147079468, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8565160036087036, + "num_tokens": 404961867.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "grad_norm": 1.435580849647522, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8752460479736328, + "num_tokens": 405000453.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "grad_norm": 1.4503633975982666, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8761276006698608, + "num_tokens": 405039859.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "grad_norm": 1.5064027309417725, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8611710667610168, + "num_tokens": 405077849.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "grad_norm": 1.5850446224212646, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.865052342414856, + "num_tokens": 405115885.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "grad_norm": 1.4061113595962524, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8956879377365112, + "num_tokens": 405154374.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "grad_norm": 1.5368260145187378, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8834670782089233, + "num_tokens": 405187369.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "grad_norm": 1.477133870124817, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.873355507850647, + "num_tokens": 405229211.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "grad_norm": 1.438834309577942, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8923365473747253, + "num_tokens": 405266356.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "grad_norm": 1.6258933544158936, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 
0.8631627559661865, + "num_tokens": 405300996.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "grad_norm": 1.4785873889923096, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8796716332435608, + "num_tokens": 405339349.0, + "step": 10621 + }, + { + "epoch": 1.3512275791883983, + "grad_norm": 1.5807287693023682, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.885709285736084, + "num_tokens": 405374090.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "grad_norm": 1.490820050239563, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8752967119216919, + "num_tokens": 405412513.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "grad_norm": 1.631559133529663, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8602622151374817, + "num_tokens": 405446284.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "grad_norm": 1.489480972290039, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8791664242744446, + "num_tokens": 405484928.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "grad_norm": 1.6057102680206299, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8742085099220276, + "num_tokens": 405518765.0, + "step": 10626 + }, + { + "epoch": 1.351863630581351, + "grad_norm": 1.4999271631240845, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8760721683502197, + "num_tokens": 405556174.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "grad_norm": 1.473210096359253, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8757752180099487, + "num_tokens": 405596643.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "grad_norm": 1.4219970703125, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8719069361686707, + "num_tokens": 405639027.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "grad_norm": 
1.5636850595474243, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8702744245529175, + "num_tokens": 405677444.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "grad_norm": 1.5849502086639404, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8791613578796387, + "num_tokens": 405716173.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "grad_norm": 1.5896321535110474, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8744373917579651, + "num_tokens": 405749937.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "grad_norm": 1.4910026788711548, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.875004768371582, + "num_tokens": 405789892.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "grad_norm": 1.4485841989517212, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.883149266242981, + "num_tokens": 405830401.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "grad_norm": 1.3607076406478882, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8830204010009766, + "num_tokens": 405872877.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "grad_norm": 1.5118416547775269, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8793314099311829, + "num_tokens": 405909696.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "grad_norm": 1.537261724472046, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8800612688064575, + "num_tokens": 405949315.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "grad_norm": 1.5053929090499878, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8744298815727234, + "num_tokens": 405990572.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "grad_norm": 1.5438413619995117, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8672986030578613, + "num_tokens": 
406035658.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "grad_norm": 1.356492519378662, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8783251643180847, + "num_tokens": 406081713.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + "grad_norm": 1.5173423290252686, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8677142262458801, + "num_tokens": 406117084.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "grad_norm": 1.640677571296692, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8667688369750977, + "num_tokens": 406150265.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "grad_norm": 1.3918503522872925, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8931304216384888, + "num_tokens": 406189679.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "grad_norm": 1.4906706809997559, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8657950758934021, + "num_tokens": 406231657.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "grad_norm": 1.5504858493804932, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8793340921401978, + "num_tokens": 406270897.0, + "step": 10645 + }, + { + "epoch": 1.3542806258745705, + "grad_norm": 1.4689910411834717, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8843355178833008, + "num_tokens": 406310592.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "grad_norm": 1.6235398054122925, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8616927862167358, + "num_tokens": 406344816.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "grad_norm": 1.429513931274414, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8832355737686157, + "num_tokens": 406387086.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "grad_norm": 1.7722500562667847, + "learning_rate": 
1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8683359026908875, + "num_tokens": 406421820.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "grad_norm": 1.4993358850479126, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8836932182312012, + "num_tokens": 406457981.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "grad_norm": 1.3675793409347534, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8937497138977051, + "num_tokens": 406498733.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "grad_norm": 1.4372611045837402, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8772059082984924, + "num_tokens": 406538627.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "grad_norm": 1.4988094568252563, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8774170875549316, + "num_tokens": 406579778.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "grad_norm": 1.4854968786239624, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8838458061218262, + "num_tokens": 406618286.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "grad_norm": 1.535749912261963, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8703035116195679, + "num_tokens": 406657615.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "grad_norm": 1.6959373950958252, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8709434866905212, + "num_tokens": 406692809.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "grad_norm": 1.5955520868301392, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8695462942123413, + "num_tokens": 406727792.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "grad_norm": 1.5391007661819458, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8832935094833374, + "num_tokens": 406766310.0, + "step": 10658 + }, 
+ { + "epoch": 1.3559343594962474, + "grad_norm": 1.3871960639953613, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8990136981010437, + "num_tokens": 406805011.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "grad_norm": 1.6623460054397583, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8649606704711914, + "num_tokens": 406841058.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "grad_norm": 1.4107903242111206, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.879260778427124, + "num_tokens": 406884125.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "grad_norm": 1.4188510179519653, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8792563676834106, + "num_tokens": 406929929.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "grad_norm": 1.443084478378296, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8714766502380371, + "num_tokens": 406971114.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "grad_norm": 1.5679078102111816, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8686826229095459, + "num_tokens": 407006429.0, + "step": 10664 + }, + { + "epoch": 1.3566976211677904, + "grad_norm": 1.4433543682098389, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8775845766067505, + "num_tokens": 407048087.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "grad_norm": 1.3979636430740356, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8781096935272217, + "num_tokens": 407090599.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "grad_norm": 1.4236091375350952, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8826069831848145, + "num_tokens": 407130671.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "grad_norm": 1.472777247428894, + "learning_rate": 1e-06, + "loss": 0.3199, + 
"mean_token_accuracy": 0.8867431879043579, + "num_tokens": 407167401.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "grad_norm": 1.6962116956710815, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8696496486663818, + "num_tokens": 407201041.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "grad_norm": 1.5101810693740845, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8881763219833374, + "num_tokens": 407238193.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "grad_norm": 1.5182093381881714, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8802375793457031, + "num_tokens": 407276166.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "grad_norm": 1.5582929849624634, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8863496780395508, + "num_tokens": 407310479.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "grad_norm": 1.5365382432937622, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8825200796127319, + "num_tokens": 407343865.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "grad_norm": 1.5742377042770386, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8596327900886536, + "num_tokens": 407378087.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "grad_norm": 1.5215879678726196, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8718380928039551, + "num_tokens": 407418558.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "grad_norm": 1.517262578010559, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8777482509613037, + "num_tokens": 407460795.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "grad_norm": 1.515241026878357, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8703691363334656, + "num_tokens": 407501957.0, + "step": 10677 + }, + { + "epoch": 
1.358351354789467, + "grad_norm": 1.593813180923462, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8646935224533081, + "num_tokens": 407540823.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "grad_norm": 1.5478814840316772, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8782578706741333, + "num_tokens": 407576216.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "grad_norm": 1.5355583429336548, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8700410723686218, + "num_tokens": 407616575.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "grad_norm": 1.7232012748718262, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8559380769729614, + "num_tokens": 407649582.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "grad_norm": 1.6791877746582031, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8796057105064392, + "num_tokens": 407679842.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "grad_norm": 1.4177680015563965, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8841679096221924, + "num_tokens": 407718827.0, + "step": 10683 + }, + { + "epoch": 1.3591146164610102, + "grad_norm": 1.5299919843673706, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8746572732925415, + "num_tokens": 407761682.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "grad_norm": 1.6348952054977417, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8834302425384521, + "num_tokens": 407795897.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "grad_norm": 1.3983863592147827, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8847424983978271, + "num_tokens": 407834652.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "grad_norm": 1.4769004583358765, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 
0.8645310997962952, + "num_tokens": 407876338.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "grad_norm": 1.5416759252548218, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8689886331558228, + "num_tokens": 407912120.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "grad_norm": 1.519424319267273, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8777443170547485, + "num_tokens": 407949649.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "grad_norm": 1.4394214153289795, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8866689205169678, + "num_tokens": 407986685.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "grad_norm": 1.491830587387085, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8879668712615967, + "num_tokens": 408023512.0, + "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "grad_norm": 1.4352662563323975, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8813580274581909, + "num_tokens": 408064844.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "grad_norm": 1.5545027256011963, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8653252720832825, + "num_tokens": 408103216.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "grad_norm": 1.5620380640029907, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8755745887756348, + "num_tokens": 408137235.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "grad_norm": 1.5117912292480469, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8817445039749146, + "num_tokens": 408171308.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "grad_norm": 1.3946876525878906, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8749617338180542, + "num_tokens": 408215107.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "grad_norm": 
1.5590626001358032, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8866786360740662, + "num_tokens": 408250855.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "grad_norm": 1.697503685951233, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8798007965087891, + "num_tokens": 408287211.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "grad_norm": 1.5969407558441162, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8745909333229065, + "num_tokens": 408323126.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "grad_norm": 1.6233928203582764, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8833551406860352, + "num_tokens": 408353389.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "grad_norm": 1.5800734758377075, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8721305131912231, + "num_tokens": 408388632.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "grad_norm": 1.5028867721557617, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8616961240768433, + "num_tokens": 408427359.0, + "step": 10702 + }, + { + "epoch": 1.3615316117542298, + "grad_norm": 1.5034946203231812, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8763179183006287, + "num_tokens": 408471699.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "grad_norm": 1.711394190788269, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8569966554641724, + "num_tokens": 408507320.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "grad_norm": 1.6098576784133911, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8819705247879028, + "num_tokens": 408542655.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "grad_norm": 1.53523588180542, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8798047304153442, + "num_tokens": 
408581561.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "grad_norm": 1.5174651145935059, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.890690803527832, + "num_tokens": 408616926.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "grad_norm": 1.4293307065963745, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8770714998245239, + "num_tokens": 408656611.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "grad_norm": 1.6743563413619995, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.866530179977417, + "num_tokens": 408688439.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "grad_norm": 1.3881621360778809, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8864002227783203, + "num_tokens": 408727271.0, + "step": 10710 + }, + { + "epoch": 1.3625492939829538, + "grad_norm": 1.5594850778579712, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8679599761962891, + "num_tokens": 408765586.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "grad_norm": 1.5473501682281494, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.865123987197876, + "num_tokens": 408802368.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "grad_norm": 1.5188953876495361, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8828123807907104, + "num_tokens": 408836999.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "grad_norm": 1.4911911487579346, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8849352598190308, + "num_tokens": 408874331.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "grad_norm": 1.5974712371826172, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.881189227104187, + "num_tokens": 408904506.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "grad_norm": 1.4945608377456665, + "learning_rate": 
1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8676998019218445, + "num_tokens": 408943649.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "grad_norm": 1.4229652881622314, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.871934711933136, + "num_tokens": 408984292.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "grad_norm": 1.5088123083114624, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8749775886535645, + "num_tokens": 409020758.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "grad_norm": 1.5580934286117554, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8583064079284668, + "num_tokens": 409061157.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "grad_norm": 1.5969651937484741, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8782728910446167, + "num_tokens": 409096888.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "grad_norm": 1.4534227848052979, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8787907958030701, + "num_tokens": 409135664.0, + "step": 10721 + }, + { + "epoch": 1.3639486070474494, + "grad_norm": 1.506692886352539, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8815848231315613, + "num_tokens": 409170950.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "grad_norm": 1.510373592376709, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8779265880584717, + "num_tokens": 409207090.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "grad_norm": 1.5383968353271484, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8733811378479004, + "num_tokens": 409243216.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "grad_norm": 1.6573853492736816, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8773751258850098, + "num_tokens": 409276252.0, + "step": 10725 + }, + { 
+ "epoch": 1.3644574481618115, + "grad_norm": 1.4985989332199097, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8662950992584229, + "num_tokens": 409315404.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "grad_norm": 1.5494771003723145, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8698078393936157, + "num_tokens": 409354307.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "grad_norm": 1.4588547945022583, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8880389928817749, + "num_tokens": 409391906.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "grad_norm": 1.6094801425933838, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.894226610660553, + "num_tokens": 409425511.0, + "step": 10729 + }, + { + "epoch": 1.3649662892761736, + "grad_norm": 1.5015816688537598, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8761103749275208, + "num_tokens": 409464093.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "grad_norm": 1.3872785568237305, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.9007798433303833, + "num_tokens": 409502744.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "grad_norm": 1.500910997390747, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8668684959411621, + "num_tokens": 409547406.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "grad_norm": 1.505406141281128, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8760254383087158, + "num_tokens": 409584487.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "grad_norm": 1.5506397485733032, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8726428747177124, + "num_tokens": 409620478.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "grad_norm": 1.5363746881484985, + "learning_rate": 1e-06, + "loss": 0.3724, + 
"mean_token_accuracy": 0.8737142086029053, + "num_tokens": 409660645.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "grad_norm": 1.6329210996627808, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8783860206604004, + "num_tokens": 409696556.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "grad_norm": 1.4881417751312256, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8766844272613525, + "num_tokens": 409735288.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "grad_norm": 1.4474098682403564, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8943758010864258, + "num_tokens": 409776151.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "grad_norm": 1.3643044233322144, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8909865617752075, + "num_tokens": 409815032.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "grad_norm": 1.3503963947296143, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8760725259780884, + "num_tokens": 409858870.0, + "step": 10740 + }, + { + "epoch": 1.3663656023406692, + "grad_norm": 1.4385764598846436, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8725394010543823, + "num_tokens": 409899563.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "grad_norm": 1.3961379528045654, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8790284395217896, + "num_tokens": 409943752.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "grad_norm": 1.4631321430206299, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.873782753944397, + "num_tokens": 409984888.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "grad_norm": 1.4641940593719482, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8960000276565552, + "num_tokens": 410019635.0, + "step": 10744 + }, + { + "epoch": 
1.366874443455031, + "grad_norm": 1.5134233236312866, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8918142318725586, + "num_tokens": 410053570.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "grad_norm": 1.3691465854644775, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8903215527534485, + "num_tokens": 410092519.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "grad_norm": 1.4999843835830688, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8908355236053467, + "num_tokens": 410128729.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "grad_norm": 1.5069605112075806, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8699580430984497, + "num_tokens": 410168601.0, + "step": 10748 + }, + { + "epoch": 1.3673832845693932, + "grad_norm": 1.4467750787734985, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8763513565063477, + "num_tokens": 410210726.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "grad_norm": 1.6874110698699951, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8616361021995544, + "num_tokens": 410244626.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "grad_norm": 1.6152552366256714, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8731226325035095, + "num_tokens": 410282594.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "grad_norm": 1.523118495941162, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8855233192443848, + "num_tokens": 410319333.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "grad_norm": 1.6404190063476562, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8836547136306763, + "num_tokens": 410353189.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "grad_norm": 1.565988540649414, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 
0.8797128200531006, + "num_tokens": 410390247.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "grad_norm": 1.5227190256118774, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8927749395370483, + "num_tokens": 410430445.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "grad_norm": 1.5303267240524292, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8730741739273071, + "num_tokens": 410466024.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "grad_norm": 1.3907166719436646, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8731915354728699, + "num_tokens": 410513917.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "grad_norm": 1.573840856552124, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8820799589157104, + "num_tokens": 410546768.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "grad_norm": 1.557589054107666, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8808684349060059, + "num_tokens": 410580593.0, + "step": 10759 + }, + { + "epoch": 1.3687825976338888, + "grad_norm": 1.6202951669692993, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8716068863868713, + "num_tokens": 410616522.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "grad_norm": 1.4943522214889526, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.867332935333252, + "num_tokens": 410659904.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "grad_norm": 1.5198646783828735, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8795256614685059, + "num_tokens": 410699270.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "grad_norm": 1.5760434865951538, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.879439651966095, + "num_tokens": 410735080.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "grad_norm": 
1.4699369668960571, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8814013004302979, + "num_tokens": 410773931.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "grad_norm": 1.5779502391815186, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8728561401367188, + "num_tokens": 410809512.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "grad_norm": 1.6531474590301514, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8593040704727173, + "num_tokens": 410846453.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "grad_norm": 1.625296950340271, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8691846132278442, + "num_tokens": 410883365.0, + "step": 10767 + }, + { + "epoch": 1.3698002798626128, + "grad_norm": 1.4583749771118164, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8762255311012268, + "num_tokens": 410923201.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "grad_norm": 1.4577406644821167, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8789170384407043, + "num_tokens": 410962866.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "grad_norm": 1.5029425621032715, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8637144565582275, + "num_tokens": 411004647.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "grad_norm": 1.522214651107788, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8812504410743713, + "num_tokens": 411041976.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "grad_norm": 1.5287505388259888, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8833997845649719, + "num_tokens": 411076683.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "grad_norm": 1.5322271585464478, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8737096786499023, + 
"num_tokens": 411112431.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "grad_norm": 1.6520113945007324, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8620821237564087, + "num_tokens": 411146533.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "grad_norm": 1.584644079208374, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8659931421279907, + "num_tokens": 411183109.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "grad_norm": 1.5235142707824707, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8790616393089294, + "num_tokens": 411219468.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "grad_norm": 1.4927610158920288, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8719938397407532, + "num_tokens": 411259005.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "grad_norm": 1.4413831233978271, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8789856433868408, + "num_tokens": 411297145.0, + "step": 10778 + }, + { + "epoch": 1.3711995929271086, + "grad_norm": 1.430740475654602, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8618596196174622, + "num_tokens": 411343468.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "grad_norm": 1.4809073209762573, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8771830797195435, + "num_tokens": 411382785.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "grad_norm": 1.4604132175445557, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8731966018676758, + "num_tokens": 411425750.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "grad_norm": 1.4969698190689087, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.883686900138855, + "num_tokens": 411461659.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "grad_norm": 1.550472378730774, + 
"learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8563727140426636, + "num_tokens": 411502147.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "grad_norm": 1.5335872173309326, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8797472715377808, + "num_tokens": 411539535.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "grad_norm": 1.6868692636489868, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8710360527038574, + "num_tokens": 411576439.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "grad_norm": 1.586242914199829, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8886967897415161, + "num_tokens": 411613701.0, + "step": 10786 + }, + { + "epoch": 1.3722172751558326, + "grad_norm": 1.470658779144287, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8616677522659302, + "num_tokens": 411657372.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "grad_norm": 1.3873634338378906, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8902848362922668, + "num_tokens": 411697589.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "grad_norm": 1.5544475317001343, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8605412244796753, + "num_tokens": 411739257.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "grad_norm": 1.5361404418945312, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8794858455657959, + "num_tokens": 411774242.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "grad_norm": 1.4555176496505737, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8941115736961365, + "num_tokens": 411809360.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "grad_norm": 1.5291156768798828, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.860878050327301, + "num_tokens": 411848405.0, + 
"step": 10792 + }, + { + "epoch": 1.3729805368273755, + "grad_norm": 1.5778071880340576, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8787941932678223, + "num_tokens": 411885232.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "grad_norm": 1.6683106422424316, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.866934061050415, + "num_tokens": 411922682.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "grad_norm": 1.6125173568725586, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8838452100753784, + "num_tokens": 411956658.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "grad_norm": 1.5525248050689697, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.874261736869812, + "num_tokens": 411996454.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "grad_norm": 1.489344596862793, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8708041906356812, + "num_tokens": 412039076.0, + "step": 10797 + }, + { + "epoch": 1.3736165882203282, + "grad_norm": 1.3633586168289185, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8797673583030701, + "num_tokens": 412084999.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "grad_norm": 1.4554111957550049, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8727095127105713, + "num_tokens": 412127020.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "grad_norm": 1.5825947523117065, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8771700263023376, + "num_tokens": 412164551.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "grad_norm": 1.57893705368042, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8932916522026062, + "num_tokens": 412198854.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "grad_norm": 1.4071871042251587, + "learning_rate": 1e-06, + "loss": 
0.3089, + "mean_token_accuracy": 0.8900882005691528, + "num_tokens": 412236209.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "grad_norm": 1.4489933252334595, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8690609931945801, + "num_tokens": 412275026.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "grad_norm": 1.5564894676208496, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.876189112663269, + "num_tokens": 412310989.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "grad_norm": 1.6337603330612183, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8646645545959473, + "num_tokens": 412345023.0, + "step": 10805 + }, + { + "epoch": 1.3746342704490524, + "grad_norm": 1.6361814737319946, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8714646100997925, + "num_tokens": 412378578.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "grad_norm": 1.588606595993042, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8774940967559814, + "num_tokens": 412417278.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "grad_norm": 1.4657831192016602, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.887285590171814, + "num_tokens": 412455586.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "grad_norm": 1.642087697982788, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8643604516983032, + "num_tokens": 412493549.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "grad_norm": 1.515718698501587, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8711986541748047, + "num_tokens": 412531176.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "grad_norm": 1.5802842378616333, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8513767719268799, + "num_tokens": 412575295.0, + "step": 10811 + }, + { + "epoch": 
1.3753975321205953, + "grad_norm": 1.5475529432296753, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8753653168678284, + "num_tokens": 412613329.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "grad_norm": 1.53348970413208, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8874708414077759, + "num_tokens": 412646978.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "grad_norm": 1.5936884880065918, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8700056076049805, + "num_tokens": 412685210.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "grad_norm": 1.5495402812957764, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8833957314491272, + "num_tokens": 412720224.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "grad_norm": 1.4611736536026, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.885938286781311, + "num_tokens": 412759932.0, + "step": 10816 + }, + { + "epoch": 1.3760335835135478, + "grad_norm": 1.7323460578918457, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8451830744743347, + "num_tokens": 412795619.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "grad_norm": 1.4008532762527466, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8903276920318604, + "num_tokens": 412833630.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "grad_norm": 1.4197235107421875, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8763478994369507, + "num_tokens": 412874563.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "grad_norm": 1.3818570375442505, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8877310752868652, + "num_tokens": 412914913.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "grad_norm": 1.4633866548538208, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 
0.871616780757904, + "num_tokens": 412957185.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "grad_norm": 1.6399569511413574, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8752588033676147, + "num_tokens": 412994363.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "grad_norm": 1.5402082204818726, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8524377942085266, + "num_tokens": 413034288.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "grad_norm": 1.4464815855026245, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8854771852493286, + "num_tokens": 413076258.0, + "step": 10824 + }, + { + "epoch": 1.377051265742272, + "grad_norm": 1.4933009147644043, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8821985125541687, + "num_tokens": 413112852.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "grad_norm": 1.422192096710205, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8776767253875732, + "num_tokens": 413155513.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "grad_norm": 1.3875879049301147, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.869075357913971, + "num_tokens": 413201962.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "grad_norm": 1.553724765777588, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8843889236450195, + "num_tokens": 413240953.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "grad_norm": 1.4807738065719604, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8799551725387573, + "num_tokens": 413282103.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "grad_norm": 1.4844721555709839, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8678474426269531, + "num_tokens": 413321851.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "grad_norm": 
1.4958877563476562, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8675588369369507, + "num_tokens": 413363058.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "grad_norm": 1.5096399784088135, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.871475338935852, + "num_tokens": 413405090.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "grad_norm": 1.5321413278579712, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8738909959793091, + "num_tokens": 413443639.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "grad_norm": 1.3902479410171509, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8793448209762573, + "num_tokens": 413486223.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "grad_norm": 1.5664751529693604, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8716283440589905, + "num_tokens": 413523769.0, + "step": 10835 + }, + { + "epoch": 1.3784505788067676, + "grad_norm": 1.4316911697387695, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8729190826416016, + "num_tokens": 413567672.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "grad_norm": 1.570008635520935, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8758379220962524, + "num_tokens": 413599397.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "grad_norm": 1.3895033597946167, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8779467344284058, + "num_tokens": 413642279.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "grad_norm": 1.7975749969482422, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8716692924499512, + "num_tokens": 413674646.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "grad_norm": 1.742977261543274, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8737471699714661, + "num_tokens": 
413706605.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "grad_norm": 1.4991910457611084, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8742063641548157, + "num_tokens": 413744999.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "grad_norm": 1.492484211921692, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8599892854690552, + "num_tokens": 413788857.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "grad_norm": 1.4189502000808716, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8801984786987305, + "num_tokens": 413831270.0, + "step": 10843 + }, + { + "epoch": 1.3794682610354916, + "grad_norm": 1.5655767917633057, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8675850033760071, + "num_tokens": 413877229.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "grad_norm": 1.6396397352218628, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8810293674468994, + "num_tokens": 413914778.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "grad_norm": 1.4777945280075073, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8733196258544922, + "num_tokens": 413955376.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "grad_norm": 1.3505957126617432, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8789029717445374, + "num_tokens": 413997360.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "grad_norm": 1.351836919784546, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8961153030395508, + "num_tokens": 414039079.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "grad_norm": 1.5238935947418213, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8728935122489929, + "num_tokens": 414074856.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "grad_norm": 1.4682965278625488, + 
"learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.883889377117157, + "num_tokens": 414115366.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "grad_norm": 1.4002641439437866, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8770500421524048, + "num_tokens": 414156737.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "grad_norm": 1.5101914405822754, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8759929537773132, + "num_tokens": 414194103.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "grad_norm": 1.4965487718582153, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8799173831939697, + "num_tokens": 414231409.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "grad_norm": 1.5764397382736206, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8832669258117676, + "num_tokens": 414267918.0, + "step": 10854 + }, + { + "epoch": 1.3808675740999874, + "grad_norm": 1.4973219633102417, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8597802519798279, + "num_tokens": 414308571.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "grad_norm": 1.5560073852539062, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8860386610031128, + "num_tokens": 414341748.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "grad_norm": 1.34902822971344, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8835188150405884, + "num_tokens": 414383402.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "grad_norm": 1.5144964456558228, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8711363077163696, + "num_tokens": 414422753.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "grad_norm": 1.4160418510437012, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8813755512237549, + "num_tokens": 414461167.0, + 
"step": 10859 + }, + { + "epoch": 1.3815036254929398, + "grad_norm": 1.4477840662002563, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8743079900741577, + "num_tokens": 414499564.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "grad_norm": 1.4588356018066406, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8701858520507812, + "num_tokens": 414540449.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "grad_norm": 1.603643536567688, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8875094056129456, + "num_tokens": 414578993.0, + "step": 10862 + }, + { + "epoch": 1.3818852563287114, + "grad_norm": 1.502034068107605, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8658811450004578, + "num_tokens": 414619417.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "grad_norm": 1.4894895553588867, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8714715242385864, + "num_tokens": 414657356.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "grad_norm": 1.4455660581588745, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8930565118789673, + "num_tokens": 414697247.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "grad_norm": 1.4169056415557861, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8811904191970825, + "num_tokens": 414739291.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "grad_norm": 1.552151083946228, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8744205236434937, + "num_tokens": 414778701.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "grad_norm": 1.4356114864349365, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8698321580886841, + "num_tokens": 414819595.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "grad_norm": 1.5395187139511108, + "learning_rate": 1e-06, + "loss": 
0.3875, + "mean_token_accuracy": 0.8674216866493225, + "num_tokens": 414859726.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "grad_norm": 1.377380132675171, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8775492906570435, + "num_tokens": 414903715.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "grad_norm": 1.5487067699432373, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8832650780677795, + "num_tokens": 414938229.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "grad_norm": 1.5166670083999634, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8691486120223999, + "num_tokens": 414974555.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "grad_norm": 1.5310242176055908, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8793966174125671, + "num_tokens": 415010087.0, + "step": 10873 + }, + { + "epoch": 1.383284569393207, + "grad_norm": 1.5876052379608154, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8791276812553406, + "num_tokens": 415040398.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "grad_norm": 1.5457426309585571, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8909764289855957, + "num_tokens": 415074771.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "grad_norm": 1.4981474876403809, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8819783926010132, + "num_tokens": 415114929.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "grad_norm": 1.6061726808547974, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8573192358016968, + "num_tokens": 415151966.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "grad_norm": 1.4484599828720093, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8753818273544312, + "num_tokens": 415192034.0, + "step": 10878 + }, + { + "epoch": 
1.3839206207861596, + "grad_norm": 1.5250240564346313, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.871410071849823, + "num_tokens": 415229342.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "grad_norm": 1.5566579103469849, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8847501873970032, + "num_tokens": 415267008.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "grad_norm": 1.4406466484069824, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8784956932067871, + "num_tokens": 415307450.0, + "step": 10881 + }, + { + "epoch": 1.384302251621931, + "grad_norm": 1.49050772190094, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8777910470962524, + "num_tokens": 415348253.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "grad_norm": 1.497531533241272, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8896948099136353, + "num_tokens": 415385281.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "grad_norm": 1.3706718683242798, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8871207237243652, + "num_tokens": 415428564.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "grad_norm": 1.4950013160705566, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8821495175361633, + "num_tokens": 415467067.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "grad_norm": 1.6531363725662231, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8872618079185486, + "num_tokens": 415501557.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "grad_norm": 1.4923017024993896, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.865172266960144, + "num_tokens": 415541751.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "grad_norm": 1.5667309761047363, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 
0.8898609280586243, + "num_tokens": 415578117.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "grad_norm": 1.440276861190796, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.886523425579071, + "num_tokens": 415615534.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "grad_norm": 1.447886347770691, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8857380747795105, + "num_tokens": 415657001.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "grad_norm": 1.5281319618225098, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8877726197242737, + "num_tokens": 415691903.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "grad_norm": 1.574036717414856, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.867569625377655, + "num_tokens": 415730161.0, + "step": 10892 + }, + { + "epoch": 1.3857015646864266, + "grad_norm": 1.6442501544952393, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8802723288536072, + "num_tokens": 415765406.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "grad_norm": 1.6267333030700684, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8688951730728149, + "num_tokens": 415801447.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "grad_norm": 1.532248616218567, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8562477231025696, + "num_tokens": 415841195.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "grad_norm": 1.5686982870101929, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8585430383682251, + "num_tokens": 415881263.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "grad_norm": 1.6358226537704468, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8703498244285583, + "num_tokens": 415920965.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "grad_norm": 
1.63420832157135, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8624601364135742, + "num_tokens": 415960607.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "grad_norm": 1.6258769035339355, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8624542355537415, + "num_tokens": 415997325.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "grad_norm": 1.6274861097335815, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.878652811050415, + "num_tokens": 416029312.0, + "step": 10900 + }, + { + "epoch": 1.3867192469151508, + "grad_norm": 1.4361025094985962, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8828986883163452, + "num_tokens": 416070361.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "grad_norm": 1.4225482940673828, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8845263719558716, + "num_tokens": 416110341.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "grad_norm": 1.6549229621887207, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8968508243560791, + "num_tokens": 416141537.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "grad_norm": 1.6380832195281982, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8647720813751221, + "num_tokens": 416181144.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "grad_norm": 1.573534607887268, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8673450946807861, + "num_tokens": 416223150.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "grad_norm": 1.493230938911438, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8722490072250366, + "num_tokens": 416261871.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "grad_norm": 1.4883719682693481, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.879793107509613, + "num_tokens": 
416298376.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "grad_norm": 1.5014945268630981, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8709738254547119, + "num_tokens": 416341096.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "grad_norm": 1.590457797050476, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8786849975585938, + "num_tokens": 416375950.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "grad_norm": 1.5730602741241455, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8850771188735962, + "num_tokens": 416409255.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "grad_norm": 1.5319223403930664, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8791492581367493, + "num_tokens": 416445510.0, + "step": 10911 + }, + { + "epoch": 1.3881185599796464, + "grad_norm": 1.5707968473434448, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8670151829719543, + "num_tokens": 416483253.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "grad_norm": 1.4200834035873413, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8935060501098633, + "num_tokens": 416523520.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "grad_norm": 1.5099836587905884, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8694940209388733, + "num_tokens": 416561905.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "grad_norm": 1.5472986698150635, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8759492635726929, + "num_tokens": 416600236.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "grad_norm": 1.5686352252960205, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8783980011940002, + "num_tokens": 416633097.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "grad_norm": 1.5601028203964233, + "learning_rate": 
1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8785980939865112, + "num_tokens": 416669850.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "grad_norm": 1.6707446575164795, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8695212006568909, + "num_tokens": 416706060.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "grad_norm": 1.5215644836425781, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8707523345947266, + "num_tokens": 416747710.0, + "step": 10919 + }, + { + "epoch": 1.3891362422083704, + "grad_norm": 1.5778204202651978, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8731746673583984, + "num_tokens": 416783352.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "grad_norm": 1.5879182815551758, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8685851097106934, + "num_tokens": 416824908.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "grad_norm": 1.5097309350967407, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8830398321151733, + "num_tokens": 416867116.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "grad_norm": 1.4580540657043457, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8785572052001953, + "num_tokens": 416908352.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "grad_norm": 1.5929949283599854, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8495190739631653, + "num_tokens": 416947717.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "grad_norm": 1.6119920015335083, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.880614697933197, + "num_tokens": 416981063.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "grad_norm": 1.6854811906814575, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8744058609008789, + "num_tokens": 417017985.0, + "step": 10926 + }, + { 
+ "epoch": 1.390026714158504, + "grad_norm": 1.6885418891906738, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8801345825195312, + "num_tokens": 417049879.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "grad_norm": 1.6166627407073975, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8748836517333984, + "num_tokens": 417087725.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "grad_norm": 1.5228227376937866, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8765486478805542, + "num_tokens": 417126152.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "grad_norm": 1.7197579145431519, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8661096096038818, + "num_tokens": 417165469.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, + "grad_norm": 1.4347859621047974, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8774949312210083, + "num_tokens": 417204454.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "grad_norm": 1.7316854000091553, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8514146208763123, + "num_tokens": 417245747.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "grad_norm": 1.5056278705596924, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8824066519737244, + "num_tokens": 417281969.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "grad_norm": 1.5854274034500122, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8740465044975281, + "num_tokens": 417320577.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "grad_norm": 1.5426273345947266, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8574193716049194, + "num_tokens": 417359088.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "grad_norm": 1.4891624450683594, + "learning_rate": 1e-06, + "loss": 0.2989, + 
"mean_token_accuracy": 0.8942264318466187, + "num_tokens": 417393535.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "grad_norm": 1.544459342956543, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8687930703163147, + "num_tokens": 417432610.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "grad_norm": 1.5540262460708618, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8666465282440186, + "num_tokens": 417475207.0, + "step": 10938 + }, + { + "epoch": 1.39155323750159, + "grad_norm": 1.4939944744110107, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8682920336723328, + "num_tokens": 417516515.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "grad_norm": 1.4315519332885742, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8936550617218018, + "num_tokens": 417554989.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "grad_norm": 1.5797127485275269, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8634330034255981, + "num_tokens": 417591137.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "grad_norm": 1.5646072626113892, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8690961599349976, + "num_tokens": 417626692.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "grad_norm": 1.418649435043335, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.895715594291687, + "num_tokens": 417663424.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "grad_norm": 1.546212077140808, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.870806872844696, + "num_tokens": 417700302.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "grad_norm": 1.5151338577270508, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8887029886245728, + "num_tokens": 417735221.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + 
"grad_norm": 1.5669561624526978, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8778985738754272, + "num_tokens": 417769629.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "grad_norm": 1.4156372547149658, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8723366856575012, + "num_tokens": 417810071.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "grad_norm": 1.4854319095611572, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8873963356018066, + "num_tokens": 417849111.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "grad_norm": 1.5025303363800049, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8803250789642334, + "num_tokens": 417883795.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + "grad_norm": 1.470117449760437, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8920891284942627, + "num_tokens": 417918976.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "grad_norm": 1.575753092765808, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8744484186172485, + "num_tokens": 417955591.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "grad_norm": 1.5329123735427856, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8611371517181396, + "num_tokens": 417995149.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "grad_norm": 1.4175649881362915, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8952767252922058, + "num_tokens": 418035954.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "grad_norm": 1.5492863655090332, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8740484714508057, + "num_tokens": 418072152.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "grad_norm": 1.6160681247711182, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8680400848388672, 
+ "num_tokens": 418108165.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "grad_norm": 1.4409865140914917, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8794928789138794, + "num_tokens": 418148032.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "grad_norm": 1.4365736246109009, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8762553930282593, + "num_tokens": 418188708.0, + "step": 10957 + }, + { + "epoch": 1.3939702327948098, + "grad_norm": 1.4314866065979004, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.873092770576477, + "num_tokens": 418232563.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "grad_norm": 1.6587727069854736, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.882166862487793, + "num_tokens": 418266588.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "grad_norm": 1.4653406143188477, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8681590557098389, + "num_tokens": 418307920.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "grad_norm": 1.5955173969268799, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8792809844017029, + "num_tokens": 418341122.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "grad_norm": 1.5028470754623413, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8706578612327576, + "num_tokens": 418380676.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "grad_norm": 1.4196207523345947, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8813674449920654, + "num_tokens": 418421587.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "grad_norm": 1.4553595781326294, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8809173703193665, + "num_tokens": 418464010.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "grad_norm": 1.4937596321105957, + 
"learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8786513805389404, + "num_tokens": 418504448.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "grad_norm": 1.2954610586166382, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8908736705780029, + "num_tokens": 418549344.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "grad_norm": 1.4402875900268555, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8840863704681396, + "num_tokens": 418588505.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "grad_norm": 1.552278757095337, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8575595617294312, + "num_tokens": 418628671.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + "grad_norm": 1.526529312133789, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8874787092208862, + "num_tokens": 418665475.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "grad_norm": 1.5964925289154053, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.875311017036438, + "num_tokens": 418700159.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "grad_norm": 1.3892862796783447, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8706690073013306, + "num_tokens": 418744819.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "grad_norm": 1.4748135805130005, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8707431554794312, + "num_tokens": 418785992.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "grad_norm": 1.3794851303100586, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.88182532787323, + "num_tokens": 418826905.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "grad_norm": 1.6567529439926147, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8691476583480835, + "num_tokens": 418858726.0, + "step": 
10974 + }, + { + "epoch": 1.3961328075308486, + "grad_norm": 1.5423717498779297, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8764237761497498, + "num_tokens": 418893863.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "grad_norm": 1.7974483966827393, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8711315989494324, + "num_tokens": 418926545.0, + "step": 10976 + }, + { + "epoch": 1.3963872280880296, + "grad_norm": 1.4978294372558594, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8856092691421509, + "num_tokens": 418961364.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "grad_norm": 1.5224802494049072, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8784707188606262, + "num_tokens": 418997131.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "grad_norm": 1.4529359340667725, + "learning_rate": 1e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.8999087810516357, + "num_tokens": 419029215.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "grad_norm": 1.4713689088821411, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8909955620765686, + "num_tokens": 419068199.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "grad_norm": 1.488120675086975, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8612709641456604, + "num_tokens": 419109529.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "grad_norm": 1.3451937437057495, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8912185430526733, + "num_tokens": 419146534.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "grad_norm": 1.609960675239563, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8741458654403687, + "num_tokens": 419180680.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "grad_norm": 1.5159244537353516, + "learning_rate": 1e-06, + "loss": 0.3553, 
+ "mean_token_accuracy": 0.8765158653259277, + "num_tokens": 419222482.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "grad_norm": 1.4479058980941772, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.873359203338623, + "num_tokens": 419265165.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "grad_norm": 1.5312716960906982, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8676875233650208, + "num_tokens": 419305817.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "grad_norm": 1.6264491081237793, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8748172521591187, + "num_tokens": 419340228.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "grad_norm": 1.5154191255569458, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.8980492353439331, + "num_tokens": 419376658.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "grad_norm": 1.4454129934310913, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8816744685173035, + "num_tokens": 419420421.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "grad_norm": 1.5007296800613403, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8812428116798401, + "num_tokens": 419461237.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "grad_norm": 1.5920946598052979, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.8954610824584961, + "num_tokens": 419493188.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "grad_norm": 1.5217598676681519, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8619481325149536, + "num_tokens": 419535338.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "grad_norm": 1.669198751449585, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8715566396713257, + "num_tokens": 419569284.0, + "step": 10993 + }, + { + "epoch": 
1.3985498028240682, + "grad_norm": 1.3032599687576294, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8932891488075256, + "num_tokens": 419614014.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "grad_norm": 1.6397854089736938, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8625152707099915, + "num_tokens": 419651833.0, + "step": 10995 + }, + { + "epoch": 1.3988042233812492, + "grad_norm": 1.6093825101852417, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8685197830200195, + "num_tokens": 419687726.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "grad_norm": 1.504165530204773, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8879673480987549, + "num_tokens": 419723262.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "grad_norm": 1.5022516250610352, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8707032203674316, + "num_tokens": 419760994.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "grad_norm": 1.3656566143035889, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8923909068107605, + "num_tokens": 419800701.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "grad_norm": 1.3856890201568604, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8946936726570129, + "num_tokens": 419841688.0, + "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "grad_norm": 1.5298184156417847, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8787133693695068, + "num_tokens": 419878022.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "grad_norm": 1.4986318349838257, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8664148449897766, + "num_tokens": 419915586.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "grad_norm": 1.4133951663970947, + "learning_rate": 1e-06, + "loss": 0.3504, + 
"mean_token_accuracy": 0.8786731958389282, + "num_tokens": 419956218.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "grad_norm": 1.596009373664856, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8726588487625122, + "num_tokens": 419992599.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "grad_norm": 1.470301866531372, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8608264923095703, + "num_tokens": 420036605.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "grad_norm": 1.5908151865005493, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8745847940444946, + "num_tokens": 420073827.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "grad_norm": 1.3903087377548218, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8756435513496399, + "num_tokens": 420119793.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "grad_norm": 1.5216482877731323, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.864377498626709, + "num_tokens": 420155933.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "grad_norm": 1.4752980470657349, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8777677416801453, + "num_tokens": 420196288.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "grad_norm": 1.626050353050232, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8738126754760742, + "num_tokens": 420229297.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "grad_norm": 1.6259921789169312, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8673937320709229, + "num_tokens": 420265064.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "grad_norm": 1.6427630186080933, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8704326748847961, + "num_tokens": 420300786.0, + "step": 11012 + }, + { + "epoch": 
1.4009667981172877, + "grad_norm": 1.5982868671417236, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8796179294586182, + "num_tokens": 420335963.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "grad_norm": 1.4345957040786743, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8755735158920288, + "num_tokens": 420378396.0, + "step": 11014 + }, + { + "epoch": 1.4012212186744688, + "grad_norm": 1.4601536989212036, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8741222023963928, + "num_tokens": 420423649.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "grad_norm": 1.6920032501220703, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8704802989959717, + "num_tokens": 420457706.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "grad_norm": 1.573134183883667, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8783183693885803, + "num_tokens": 420493421.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "grad_norm": 1.485996127128601, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8735079765319824, + "num_tokens": 420533510.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "grad_norm": 1.4197505712509155, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8779439926147461, + "num_tokens": 420574633.0, + "step": 11019 + }, + { + "epoch": 1.4018572700674214, + "grad_norm": 1.5395411252975464, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8734340667724609, + "num_tokens": 420612117.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "grad_norm": 1.500875473022461, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8772667050361633, + "num_tokens": 420648645.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "grad_norm": 1.506351351737976, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 
0.8793282508850098, + "num_tokens": 420687776.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "grad_norm": 1.545125126838684, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8687800765037537, + "num_tokens": 420725266.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "grad_norm": 1.5455487966537476, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8746589422225952, + "num_tokens": 420761600.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "grad_norm": 1.4081703424453735, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8823865056037903, + "num_tokens": 420801748.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "grad_norm": 1.4436328411102295, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8829297423362732, + "num_tokens": 420841519.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "grad_norm": 1.7234596014022827, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8796660304069519, + "num_tokens": 420873431.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "grad_norm": 1.6592941284179688, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8713092803955078, + "num_tokens": 420904149.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "grad_norm": 1.5023491382598877, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8716080784797668, + "num_tokens": 420945429.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "grad_norm": 1.4609854221343994, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8766421675682068, + "num_tokens": 420983312.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "grad_norm": 1.5068198442459106, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8719649910926819, + "num_tokens": 421024836.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "grad_norm": 
1.5186065435409546, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8796392679214478, + "num_tokens": 421065979.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "grad_norm": 1.543256163597107, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8793396949768066, + "num_tokens": 421104905.0, + "step": 11033 + }, + { + "epoch": 1.4036382139676886, + "grad_norm": 1.3935925960540771, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8864909410476685, + "num_tokens": 421146238.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "grad_norm": 1.5207146406173706, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8771383166313171, + "num_tokens": 421186413.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "grad_norm": 1.487935185432434, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8875538110733032, + "num_tokens": 421223181.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "grad_norm": 1.7921011447906494, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8540646433830261, + "num_tokens": 421255822.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "grad_norm": 1.5583707094192505, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8695003986358643, + "num_tokens": 421293589.0, + "step": 11038 + }, + { + "epoch": 1.404274265360641, + "grad_norm": 1.5729215145111084, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8740493655204773, + "num_tokens": 421330855.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "grad_norm": 1.5074025392532349, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8700054883956909, + "num_tokens": 421370071.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "grad_norm": 1.560508370399475, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8757873177528381, + "num_tokens": 
421406324.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "grad_norm": 1.5182976722717285, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8826490640640259, + "num_tokens": 421442762.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "grad_norm": 1.5391958951950073, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8899989724159241, + "num_tokens": 421477963.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "grad_norm": 1.731087565422058, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8842982053756714, + "num_tokens": 421504868.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "grad_norm": 1.4988819360733032, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8873445987701416, + "num_tokens": 421542157.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "grad_norm": 1.4459114074707031, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8665357232093811, + "num_tokens": 421588259.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "grad_norm": 1.5320957899093628, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8773260116577148, + "num_tokens": 421626867.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "grad_norm": 1.485349178314209, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8713548183441162, + "num_tokens": 421670001.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "grad_norm": 1.5550981760025024, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8830066919326782, + "num_tokens": 421703933.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "grad_norm": 1.5255509614944458, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8763546943664551, + "num_tokens": 421741022.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "grad_norm": 1.6081831455230713, + 
"learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8728573322296143, + "num_tokens": 421777932.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "grad_norm": 1.4602693319320679, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8880199193954468, + "num_tokens": 421817225.0, + "step": 11052 + }, + { + "epoch": 1.4060552092609082, + "grad_norm": 1.4926905632019043, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8882817029953003, + "num_tokens": 421855226.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "grad_norm": 1.5109843015670776, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.878409743309021, + "num_tokens": 421892157.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "grad_norm": 1.5259531736373901, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.873033881187439, + "num_tokens": 421930940.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "grad_norm": 1.4974901676177979, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8624138236045837, + "num_tokens": 421973133.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "grad_norm": 1.4608949422836304, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8905391097068787, + "num_tokens": 422008897.0, + "step": 11057 + }, + { + "epoch": 1.4066912606538609, + "grad_norm": 1.5290330648422241, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8640328645706177, + "num_tokens": 422047280.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "grad_norm": 1.5475213527679443, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8797404170036316, + "num_tokens": 422082541.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "grad_norm": 1.514140248298645, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8807841539382935, + "num_tokens": 422118703.0, + 
"step": 11060 + }, + { + "epoch": 1.4070728914896324, + "grad_norm": 1.4900364875793457, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.879323422908783, + "num_tokens": 422157395.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "grad_norm": 1.4478631019592285, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8760786652565002, + "num_tokens": 422200255.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "grad_norm": 1.4767706394195557, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8772916793823242, + "num_tokens": 422239233.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "grad_norm": 1.6409590244293213, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8735290169715881, + "num_tokens": 422271379.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "grad_norm": 1.521484613418579, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8881919980049133, + "num_tokens": 422305533.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "grad_norm": 1.551558017730713, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8668949604034424, + "num_tokens": 422343469.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "grad_norm": 1.5930238962173462, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8656500577926636, + "num_tokens": 422381154.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "grad_norm": 1.384623646736145, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8728037476539612, + "num_tokens": 422423816.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "grad_norm": 1.495482325553894, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8700878620147705, + "num_tokens": 422460427.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "grad_norm": 1.441015362739563, + "learning_rate": 1e-06, + "loss": 
0.3571, + "mean_token_accuracy": 0.8710296154022217, + "num_tokens": 422501517.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "grad_norm": 1.5140575170516968, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.867881178855896, + "num_tokens": 422542657.0, + "step": 11071 + }, + { + "epoch": 1.408472204554128, + "grad_norm": 1.5168213844299316, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8705606460571289, + "num_tokens": 422581574.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "grad_norm": 1.5569177865982056, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8713791370391846, + "num_tokens": 422616781.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "grad_norm": 1.5302867889404297, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8774677515029907, + "num_tokens": 422651556.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "grad_norm": 1.5213342905044556, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8826421499252319, + "num_tokens": 422687750.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "grad_norm": 1.6531928777694702, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8778172135353088, + "num_tokens": 422719801.0, + "step": 11076 + }, + { + "epoch": 1.4091082559470804, + "grad_norm": 1.4201812744140625, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8756759166717529, + "num_tokens": 422762459.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "grad_norm": 1.4824026823043823, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8689301609992981, + "num_tokens": 422802303.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "grad_norm": 1.4256230592727661, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8935302495956421, + "num_tokens": 422837655.0, + "step": 11079 + }, + { + "epoch": 
1.409489886782852, + "grad_norm": 1.504845380783081, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8588581085205078, + "num_tokens": 422881956.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "grad_norm": 1.5718899965286255, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8751216530799866, + "num_tokens": 422914684.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "grad_norm": 1.6215803623199463, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8660295009613037, + "num_tokens": 422952882.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "grad_norm": 1.3634995222091675, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8862811326980591, + "num_tokens": 422997165.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "grad_norm": 1.3811532258987427, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8782203793525696, + "num_tokens": 423042295.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "grad_norm": 1.4512102603912354, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8870540857315063, + "num_tokens": 423080855.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "grad_norm": 1.5302159786224365, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.87126624584198, + "num_tokens": 423119977.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "grad_norm": 1.3619763851165771, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.87884521484375, + "num_tokens": 423164557.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "grad_norm": 1.4573091268539429, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8765264749526978, + "num_tokens": 423205838.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "grad_norm": 1.6076668500900269, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 
0.8890954256057739, + "num_tokens": 423236192.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "grad_norm": 1.4120161533355713, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8698620200157166, + "num_tokens": 423276768.0, + "step": 11090 + }, + { + "epoch": 1.4108891998473476, + "grad_norm": 1.4882619380950928, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8765283823013306, + "num_tokens": 423315232.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "grad_norm": 1.5835561752319336, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8763564229011536, + "num_tokens": 423349399.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "grad_norm": 1.7158706188201904, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8777246475219727, + "num_tokens": 423380224.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "grad_norm": 1.458168864250183, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8797228336334229, + "num_tokens": 423416514.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "grad_norm": 1.5896782875061035, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8784003853797913, + "num_tokens": 423454383.0, + "step": 11095 + }, + { + "epoch": 1.4115252512403003, + "grad_norm": 1.478225827217102, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8809967637062073, + "num_tokens": 423491270.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "grad_norm": 1.5546330213546753, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8804792761802673, + "num_tokens": 423524656.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "grad_norm": 1.3788225650787354, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8896254897117615, + "num_tokens": 423564845.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "grad_norm": 
1.6624479293823242, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8805198669433594, + "num_tokens": 423599590.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "grad_norm": 1.4883733987808228, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8852792978286743, + "num_tokens": 423634966.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "grad_norm": 1.586666464805603, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8625847697257996, + "num_tokens": 423674808.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "grad_norm": 1.559429407119751, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8747735619544983, + "num_tokens": 423710995.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "grad_norm": 1.366215467453003, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8983334898948669, + "num_tokens": 423751404.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "grad_norm": 1.5015326738357544, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8773070573806763, + "num_tokens": 423789964.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "grad_norm": 1.5228651762008667, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8865405321121216, + "num_tokens": 423826871.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "grad_norm": 1.7068374156951904, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8824964165687561, + "num_tokens": 423859591.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "grad_norm": 1.5376019477844238, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8790072798728943, + "num_tokens": 423897999.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "grad_norm": 1.6865692138671875, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8658866882324219, + "num_tokens": 
423930799.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "grad_norm": 1.5307105779647827, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8728455305099487, + "num_tokens": 423972180.0, + "step": 11109 + }, + { + "epoch": 1.4133061951405674, + "grad_norm": 1.5548932552337646, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8764487504959106, + "num_tokens": 424010341.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "grad_norm": 1.644174337387085, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8726978898048401, + "num_tokens": 424044441.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "grad_norm": 1.724001169204712, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8657968640327454, + "num_tokens": 424079710.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "grad_norm": 1.5371065139770508, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8723502159118652, + "num_tokens": 424116155.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "grad_norm": 1.5447561740875244, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8810619115829468, + "num_tokens": 424152952.0, + "step": 11114 + }, + { + "epoch": 1.4139422465335199, + "grad_norm": 1.5218859910964966, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.880907416343689, + "num_tokens": 424191144.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "grad_norm": 1.5149894952774048, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8720473051071167, + "num_tokens": 424226889.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "grad_norm": 1.4559829235076904, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8719449043273926, + "num_tokens": 424270123.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "grad_norm": 1.3518038988113403, + "learning_rate": 
1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8767005205154419, + "num_tokens": 424313593.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "grad_norm": 1.364017367362976, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8893540501594543, + "num_tokens": 424351779.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "grad_norm": 1.5770930051803589, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8697737455368042, + "num_tokens": 424386050.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "grad_norm": 1.6071085929870605, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8682006001472473, + "num_tokens": 424418914.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "grad_norm": 1.5814969539642334, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8814170956611633, + "num_tokens": 424455009.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "grad_norm": 1.4121417999267578, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8793329000473022, + "num_tokens": 424498050.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "grad_norm": 1.6065071821212769, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8753739595413208, + "num_tokens": 424530993.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "grad_norm": 1.484302043914795, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8760274648666382, + "num_tokens": 424570223.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "grad_norm": 1.3903788328170776, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8879127502441406, + "num_tokens": 424612526.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "grad_norm": 1.5490782260894775, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8657945394515991, + "num_tokens": 424651888.0, + "step": 11127 + }, + { + 
"epoch": 1.4155959801551965, + "grad_norm": 1.426283359527588, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8823047876358032, + "num_tokens": 424690885.0, + "step": 11128 + }, + { + "epoch": 1.415723190433787, + "grad_norm": 1.4635816812515259, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8979290723800659, + "num_tokens": 424724340.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "grad_norm": 1.492523431777954, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.88837069272995, + "num_tokens": 424761327.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "grad_norm": 1.5637260675430298, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8780461549758911, + "num_tokens": 424798612.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "grad_norm": 1.3992249965667725, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8749613761901855, + "num_tokens": 424839280.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "grad_norm": 1.460855484008789, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8683184385299683, + "num_tokens": 424877415.0, + "step": 11133 + }, + { + "epoch": 1.4163592418267397, + "grad_norm": 1.5118834972381592, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8778786659240723, + "num_tokens": 424911822.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "grad_norm": 1.4392763376235962, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8564187288284302, + "num_tokens": 424956144.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "grad_norm": 1.4505122900009155, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8727319240570068, + "num_tokens": 424998299.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "grad_norm": 1.484815001487732, + "learning_rate": 1e-06, + "loss": 0.3137, + 
"mean_token_accuracy": 0.8878556489944458, + "num_tokens": 425033052.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "grad_norm": 1.376409888267517, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8963148593902588, + "num_tokens": 425069721.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "grad_norm": 1.359249472618103, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8931512832641602, + "num_tokens": 425110708.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "grad_norm": 1.564302921295166, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8709760308265686, + "num_tokens": 425149321.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "grad_norm": 1.5202378034591675, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8801559209823608, + "num_tokens": 425189084.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "grad_norm": 1.6303917169570923, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8750589489936829, + "num_tokens": 425233250.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "grad_norm": 1.4993088245391846, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8756378889083862, + "num_tokens": 425271762.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "grad_norm": 1.4419540166854858, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8851732015609741, + "num_tokens": 425310057.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "grad_norm": 1.4678685665130615, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8772404789924622, + "num_tokens": 425348327.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "grad_norm": 1.4804304838180542, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8753242492675781, + "num_tokens": 425385843.0, + "step": 11146 + }, + { + "epoch": 
1.4180129754484163, + "grad_norm": 1.7241621017456055, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8671567440032959, + "num_tokens": 425419454.0, + "step": 11147 + }, + { + "epoch": 1.4181401857270068, + "grad_norm": 1.550496220588684, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8755838871002197, + "num_tokens": 425454293.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "grad_norm": 1.403076410293579, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8795654773712158, + "num_tokens": 425494206.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "grad_norm": 1.4843257665634155, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8782370090484619, + "num_tokens": 425531757.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "grad_norm": 1.589921474456787, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8679430484771729, + "num_tokens": 425569974.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "grad_norm": 1.4895167350769043, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8721210360527039, + "num_tokens": 425610319.0, + "step": 11152 + }, + { + "epoch": 1.4187762371199593, + "grad_norm": 1.548255205154419, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8783274292945862, + "num_tokens": 425646109.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "grad_norm": 1.4185733795166016, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8718665838241577, + "num_tokens": 425689049.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "grad_norm": 1.5062453746795654, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8841150999069214, + "num_tokens": 425726059.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "grad_norm": 1.6346746683120728, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 
0.8801677227020264, + "num_tokens": 425759280.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "grad_norm": 1.505228042602539, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8728053569793701, + "num_tokens": 425797932.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "grad_norm": 1.4843711853027344, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8794946074485779, + "num_tokens": 425832192.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "grad_norm": 1.5650489330291748, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8692394495010376, + "num_tokens": 425869922.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "grad_norm": 1.5080323219299316, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8712918162345886, + "num_tokens": 425905598.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "grad_norm": 1.4801150560379028, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8811943531036377, + "num_tokens": 425942852.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "grad_norm": 1.4828758239746094, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8780791759490967, + "num_tokens": 425982015.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "grad_norm": 1.5508999824523926, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8664727807044983, + "num_tokens": 426019900.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "grad_norm": 1.5133779048919678, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8656356930732727, + "num_tokens": 426060720.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "grad_norm": 1.5079572200775146, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8798561096191406, + "num_tokens": 426101077.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "grad_norm": 
1.4917973279953003, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8675509691238403, + "num_tokens": 426139890.0, + "step": 11166 + }, + { + "epoch": 1.4205571810202264, + "grad_norm": 1.5272057056427002, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8750841021537781, + "num_tokens": 426181838.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "grad_norm": 1.4355623722076416, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8682755827903748, + "num_tokens": 426223984.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "grad_norm": 1.5798392295837402, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8904494047164917, + "num_tokens": 426257622.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "grad_norm": 1.5331010818481445, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.874180257320404, + "num_tokens": 426292796.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "grad_norm": 1.53549325466156, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8713023066520691, + "num_tokens": 426330101.0, + "step": 11171 + }, + { + "epoch": 1.421193232413179, + "grad_norm": 1.3649159669876099, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8856891989707947, + "num_tokens": 426372545.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "grad_norm": 1.3995604515075684, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8781365752220154, + "num_tokens": 426415643.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "grad_norm": 1.4873385429382324, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8792587518692017, + "num_tokens": 426453652.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "grad_norm": 1.5083413124084473, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8773474097251892, + "num_tokens": 
426490911.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "grad_norm": 1.4531558752059937, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8815127611160278, + "num_tokens": 426531305.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "grad_norm": 1.5302207469940186, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8675386905670166, + "num_tokens": 426569808.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "grad_norm": 1.4418468475341797, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8853206634521484, + "num_tokens": 426604872.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "grad_norm": 1.5035438537597656, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8626254796981812, + "num_tokens": 426645048.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "grad_norm": 1.5461233854293823, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8661670684814453, + "num_tokens": 426685713.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "grad_norm": 1.7852282524108887, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8675347566604614, + "num_tokens": 426718659.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "grad_norm": 1.5701889991760254, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8668745160102844, + "num_tokens": 426758737.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "grad_norm": 1.5329415798187256, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8772724270820618, + "num_tokens": 426798011.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "grad_norm": 1.4913151264190674, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.866172194480896, + "num_tokens": 426840664.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "grad_norm": 1.7553462982177734, + "learning_rate": 
1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.857964277267456, + "num_tokens": 426875658.0, + "step": 11185 + }, + { + "epoch": 1.422974176313446, + "grad_norm": 1.4535841941833496, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8850966691970825, + "num_tokens": 426912755.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "grad_norm": 1.4496456384658813, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8713007569313049, + "num_tokens": 426951874.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "grad_norm": 1.4772588014602661, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8874317407608032, + "num_tokens": 426986864.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "grad_norm": 1.6213058233261108, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8789677023887634, + "num_tokens": 427018967.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "grad_norm": 1.5199109315872192, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8862344026565552, + "num_tokens": 427056556.0, + "step": 11190 + }, + { + "epoch": 1.4236102277063987, + "grad_norm": 1.2957631349563599, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8807017207145691, + "num_tokens": 427101688.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "grad_norm": 1.4850728511810303, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8939447999000549, + "num_tokens": 427135328.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "grad_norm": 1.4919430017471313, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8751499652862549, + "num_tokens": 427175487.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "grad_norm": 1.575743556022644, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8714953660964966, + "num_tokens": 427209591.0, + "step": 11194 + }, + { 
+ "epoch": 1.4241190688207608, + "grad_norm": 1.6274346113204956, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8797018527984619, + "num_tokens": 427242390.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "grad_norm": 1.4636056423187256, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8855749368667603, + "num_tokens": 427278242.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "grad_norm": 1.4982097148895264, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8814783096313477, + "num_tokens": 427316117.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "grad_norm": 1.4297772645950317, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8779842853546143, + "num_tokens": 427355945.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "grad_norm": 1.6308577060699463, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8652924299240112, + "num_tokens": 427390085.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "grad_norm": 1.4639948606491089, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8794286251068115, + "num_tokens": 427433815.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "grad_norm": 1.4333131313323975, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8684178590774536, + "num_tokens": 427477401.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "grad_norm": 1.5553432703018188, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8819919228553772, + "num_tokens": 427513486.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "grad_norm": 1.3799166679382324, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8832557797431946, + "num_tokens": 427556658.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "grad_norm": 1.4212907552719116, + "learning_rate": 1e-06, + "loss": 0.327, + 
"mean_token_accuracy": 0.8850659728050232, + "num_tokens": 427596997.0, + "step": 11204 + }, + { + "epoch": 1.4253911716066658, + "grad_norm": 1.434395432472229, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8742793202400208, + "num_tokens": 427640030.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "grad_norm": 1.4858477115631104, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8818842172622681, + "num_tokens": 427676932.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "grad_norm": 1.542782187461853, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8898766040802002, + "num_tokens": 427708591.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "grad_norm": 1.6155259609222412, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8877679109573364, + "num_tokens": 427741275.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "grad_norm": 1.5095237493515015, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8671039342880249, + "num_tokens": 427778098.0, + "step": 11209 + }, + { + "epoch": 1.4260272229996183, + "grad_norm": 1.5472885370254517, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8741806745529175, + "num_tokens": 427814580.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "grad_norm": 1.4463677406311035, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8930525779724121, + "num_tokens": 427855253.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "grad_norm": 1.5564541816711426, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8766627311706543, + "num_tokens": 427891542.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "grad_norm": 1.3814152479171753, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8843305110931396, + "num_tokens": 427932693.0, + "step": 11213 + }, + { + "epoch": 
1.4265360641139804, + "grad_norm": 1.585107445716858, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.879401445388794, + "num_tokens": 427970154.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "grad_norm": 1.4187394380569458, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8853696584701538, + "num_tokens": 428008750.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "grad_norm": 1.5151286125183105, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8685205578804016, + "num_tokens": 428046006.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "grad_norm": 1.5650590658187866, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.877017617225647, + "num_tokens": 428080719.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "grad_norm": 1.525394082069397, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8674336075782776, + "num_tokens": 428117976.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "grad_norm": 1.4760727882385254, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8841539621353149, + "num_tokens": 428155165.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "grad_norm": 1.4975287914276123, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8740577697753906, + "num_tokens": 428192931.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "grad_norm": 1.4244272708892822, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8920398354530334, + "num_tokens": 428229935.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "grad_norm": 1.5297695398330688, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8697132468223572, + "num_tokens": 428268575.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "grad_norm": 1.6470904350280762, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 
0.8675660490989685, + "num_tokens": 428301990.0, + "step": 11223 + }, + { + "epoch": 1.4278081668998854, + "grad_norm": 1.4124855995178223, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8878523111343384, + "num_tokens": 428343027.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "grad_norm": 1.5127854347229004, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8676530122756958, + "num_tokens": 428382420.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "grad_norm": 1.5291752815246582, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8885440826416016, + "num_tokens": 428411204.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "grad_norm": 1.4516096115112305, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8700449466705322, + "num_tokens": 428449755.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "grad_norm": 1.4625434875488281, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8856200575828552, + "num_tokens": 428486415.0, + "step": 11228 + }, + { + "epoch": 1.428444218292838, + "grad_norm": 1.5871555805206299, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8796263337135315, + "num_tokens": 428521852.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.405872106552124, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8848941326141357, + "num_tokens": 428566363.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "grad_norm": 1.535327434539795, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.866091251373291, + "num_tokens": 428604265.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "grad_norm": 1.4707320928573608, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8838213682174683, + "num_tokens": 428643272.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "grad_norm": 
1.4418063163757324, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8545569777488708, + "num_tokens": 428687209.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "grad_norm": 1.5959129333496094, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8778172731399536, + "num_tokens": 428722880.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "grad_norm": 1.3833317756652832, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8833359479904175, + "num_tokens": 428766784.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "grad_norm": 1.5567867755889893, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8781710863113403, + "num_tokens": 428802856.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "grad_norm": 1.545636534690857, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8812754154205322, + "num_tokens": 428839329.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "grad_norm": 1.9627071619033813, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8684288263320923, + "num_tokens": 428876080.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "grad_norm": 1.476245403289795, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8658360838890076, + "num_tokens": 428915665.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "grad_norm": 1.529648780822754, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8843463659286499, + "num_tokens": 428950117.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "grad_norm": 1.4177372455596924, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8827370405197144, + "num_tokens": 428992199.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "grad_norm": 1.4581224918365479, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8840582370758057, + "num_tokens": 
429029649.0, + "step": 11242 + }, + { + "epoch": 1.4302251621931052, + "grad_norm": 1.542326807975769, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8623894453048706, + "num_tokens": 429070561.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "grad_norm": 1.4683893918991089, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8825775384902954, + "num_tokens": 429108579.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "grad_norm": 1.5283576250076294, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8710290789604187, + "num_tokens": 429146744.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "grad_norm": 1.5201225280761719, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8880088329315186, + "num_tokens": 429184144.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "grad_norm": 1.616080403327942, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8875024914741516, + "num_tokens": 429215841.0, + "step": 11247 + }, + { + "epoch": 1.430861213586058, + "grad_norm": 1.5767292976379395, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.879401445388794, + "num_tokens": 429251842.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "grad_norm": 1.3982107639312744, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8733210563659668, + "num_tokens": 429293470.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "grad_norm": 1.483774185180664, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8660268783569336, + "num_tokens": 429335070.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "grad_norm": 1.4962303638458252, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.880686342716217, + "num_tokens": 429374170.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "grad_norm": 1.5078414678573608, + "learning_rate": 
1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8647856116294861, + "num_tokens": 429413368.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "grad_norm": 1.4279707670211792, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8806535005569458, + "num_tokens": 429455010.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "grad_norm": 1.6058632135391235, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.880511999130249, + "num_tokens": 429493346.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "grad_norm": 1.4243090152740479, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8787806034088135, + "num_tokens": 429536916.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "grad_norm": 1.4986084699630737, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8947286009788513, + "num_tokens": 429570822.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "grad_norm": 1.5634815692901611, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.874488353729248, + "num_tokens": 429607779.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "grad_norm": 1.4609878063201904, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.881908655166626, + "num_tokens": 429650335.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "grad_norm": 1.5262351036071777, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8776932954788208, + "num_tokens": 429688137.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "grad_norm": 1.4773259162902832, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8729667663574219, + "num_tokens": 429728387.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "grad_norm": 1.5127500295639038, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8799672722816467, + "num_tokens": 429766763.0, + "step": 11261 + }, + { 
+ "epoch": 1.4326421574863248, + "grad_norm": 1.5433553457260132, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8761175870895386, + "num_tokens": 429803381.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "grad_norm": 1.3744856119155884, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8801131248474121, + "num_tokens": 429848461.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "grad_norm": 1.663275122642517, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8728992342948914, + "num_tokens": 429883849.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "grad_norm": 1.4407685995101929, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8762282133102417, + "num_tokens": 429922917.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "grad_norm": 1.3969106674194336, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8829681873321533, + "num_tokens": 429963878.0, + "step": 11266 + }, + { + "epoch": 1.4332782088792775, + "grad_norm": 1.5977450609207153, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8675981760025024, + "num_tokens": 429998051.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "grad_norm": 1.3999290466308594, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.889499306678772, + "num_tokens": 430037812.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "grad_norm": 1.5449212789535522, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8944141864776611, + "num_tokens": 430074385.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "grad_norm": 1.5433855056762695, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8782007694244385, + "num_tokens": 430114494.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "grad_norm": 1.4540605545043945, + "learning_rate": 1e-06, + "loss": 0.3541, + 
"mean_token_accuracy": 0.8758928775787354, + "num_tokens": 430158912.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "grad_norm": 1.4823782444000244, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8711370229721069, + "num_tokens": 430198724.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "grad_norm": 1.4630742073059082, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8718347549438477, + "num_tokens": 430235226.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "grad_norm": 1.396101951599121, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8748600482940674, + "num_tokens": 430278573.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "grad_norm": 1.5253154039382935, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8791657090187073, + "num_tokens": 430315987.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "grad_norm": 1.4309709072113037, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8701869249343872, + "num_tokens": 430361922.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "grad_norm": 1.396551251411438, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.88694828748703, + "num_tokens": 430404798.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "grad_norm": 1.3748042583465576, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8814022541046143, + "num_tokens": 430448218.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "grad_norm": 1.4177454710006714, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8805949687957764, + "num_tokens": 430487070.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "grad_norm": 1.5036065578460693, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8648592233657837, + "num_tokens": 430529943.0, + "step": 11280 + }, + { + "epoch": 
1.4350591527795447, + "grad_norm": 1.3910636901855469, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8885019421577454, + "num_tokens": 430572636.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "grad_norm": 1.6502225399017334, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8756290674209595, + "num_tokens": 430608297.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "grad_norm": 1.4981966018676758, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8796592950820923, + "num_tokens": 430643240.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "grad_norm": 1.7790403366088867, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.881079912185669, + "num_tokens": 430672298.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "grad_norm": 1.4807777404785156, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8788323402404785, + "num_tokens": 430712391.0, + "step": 11285 + }, + { + "epoch": 1.435695204172497, + "grad_norm": 1.6134161949157715, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8869025707244873, + "num_tokens": 430750699.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "grad_norm": 1.591028094291687, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8606191873550415, + "num_tokens": 430789123.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "grad_norm": 1.5121803283691406, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.865761399269104, + "num_tokens": 430828453.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "grad_norm": 1.4246644973754883, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8761600255966187, + "num_tokens": 430870773.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "grad_norm": 1.6066795587539673, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 
0.8759958744049072, + "num_tokens": 430907183.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "grad_norm": 1.4783527851104736, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8830010294914246, + "num_tokens": 430943050.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "grad_norm": 1.4870274066925049, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8765993714332581, + "num_tokens": 430983018.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "grad_norm": 1.5668931007385254, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8652366399765015, + "num_tokens": 431019029.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "grad_norm": 1.4720900058746338, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8669014573097229, + "num_tokens": 431061060.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "grad_norm": 1.616777777671814, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8826578855514526, + "num_tokens": 431090621.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "grad_norm": 1.4705755710601807, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8743882179260254, + "num_tokens": 431129654.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "grad_norm": 1.4031100273132324, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8923417925834656, + "num_tokens": 431165059.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "grad_norm": 1.630640983581543, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.845887303352356, + "num_tokens": 431203356.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "grad_norm": 1.3499085903167725, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8940658569335938, + "num_tokens": 431246910.0, + "step": 11299 + }, + { + "epoch": 1.4374761480727642, + "grad_norm": 
1.5747538805007935, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8669641017913818, + "num_tokens": 431289224.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "grad_norm": 1.358948826789856, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8846604824066162, + "num_tokens": 431331240.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "grad_norm": 1.5214852094650269, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8647105693817139, + "num_tokens": 431373239.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "grad_norm": 1.4333373308181763, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8815637826919556, + "num_tokens": 431415194.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "grad_norm": 1.3219282627105713, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8855940103530884, + "num_tokens": 431461099.0, + "step": 11304 + }, + { + "epoch": 1.438112199465717, + "grad_norm": 1.5371689796447754, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8807440996170044, + "num_tokens": 431497380.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "grad_norm": 1.5404462814331055, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8772659301757812, + "num_tokens": 431534011.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "grad_norm": 1.5424729585647583, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.874930739402771, + "num_tokens": 431568797.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "grad_norm": 1.684009313583374, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8843158483505249, + "num_tokens": 431596269.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "grad_norm": 1.5145491361618042, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8873013257980347, + 
"num_tokens": 431633173.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "grad_norm": 1.478082299232483, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8836305141448975, + "num_tokens": 431668291.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "grad_norm": 1.5012552738189697, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8691892623901367, + "num_tokens": 431708707.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "grad_norm": 1.5841906070709229, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8809018135070801, + "num_tokens": 431742896.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "grad_norm": 1.4442394971847534, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8758782744407654, + "num_tokens": 431781375.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "grad_norm": 1.572721004486084, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8781575560569763, + "num_tokens": 431818625.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "grad_norm": 1.6492178440093994, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8623604774475098, + "num_tokens": 431858704.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "grad_norm": 1.5482555627822876, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8778371810913086, + "num_tokens": 431897049.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "grad_norm": 1.540999174118042, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8788660168647766, + "num_tokens": 431932920.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "grad_norm": 1.582585334777832, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8759541511535645, + "num_tokens": 431969005.0, + "step": 11318 + }, + { + "epoch": 1.439893143365984, + "grad_norm": 1.3446433544158936, + 
"learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8812982439994812, + "num_tokens": 432013106.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "grad_norm": 1.4492597579956055, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8773101568222046, + "num_tokens": 432051216.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "grad_norm": 1.485470175743103, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8806876540184021, + "num_tokens": 432088660.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "grad_norm": 1.4488123655319214, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8666571974754333, + "num_tokens": 432132446.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "grad_norm": 1.5648692846298218, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8760585784912109, + "num_tokens": 432165750.0, + "step": 11323 + }, + { + "epoch": 1.4405291947589365, + "grad_norm": 1.500846266746521, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.872675895690918, + "num_tokens": 432204658.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "grad_norm": 1.4329850673675537, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8793988823890686, + "num_tokens": 432245120.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "grad_norm": 1.4353365898132324, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8752671480178833, + "num_tokens": 432285415.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "grad_norm": 1.474318027496338, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8865795731544495, + "num_tokens": 432323372.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "grad_norm": 1.4701392650604248, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8662678003311157, + "num_tokens": 432365109.0, + "step": 
11328 + }, + { + "epoch": 1.4411652461518891, + "grad_norm": 1.5399084091186523, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8679014444351196, + "num_tokens": 432403020.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "grad_norm": 1.3931437730789185, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.864794135093689, + "num_tokens": 432451630.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "grad_norm": 1.4871007204055786, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8633848428726196, + "num_tokens": 432494072.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "grad_norm": 1.4426329135894775, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.886823832988739, + "num_tokens": 432530248.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "grad_norm": 1.4827449321746826, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8562430143356323, + "num_tokens": 432572641.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "grad_norm": 1.4534083604812622, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8684517741203308, + "num_tokens": 432614975.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "grad_norm": 1.4167581796646118, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8836989402770996, + "num_tokens": 432654876.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "grad_norm": 1.6008530855178833, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8740518093109131, + "num_tokens": 432691951.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "grad_norm": 1.501569151878357, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8671101927757263, + "num_tokens": 432732338.0, + "step": 11337 + }, + { + "epoch": 1.4423101386592037, + "grad_norm": 1.5308626890182495, + "learning_rate": 1e-06, + "loss": 0.3706, 
+ "mean_token_accuracy": 0.8731459379196167, + "num_tokens": 432775461.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "grad_norm": 1.4926420450210571, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8574249744415283, + "num_tokens": 432817225.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "grad_norm": 1.4845727682113647, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8570799827575684, + "num_tokens": 432860463.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "grad_norm": 1.4985380172729492, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8789536952972412, + "num_tokens": 432896898.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "grad_norm": 1.6132302284240723, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8709704875946045, + "num_tokens": 432932378.0, + "step": 11342 + }, + { + "epoch": 1.4429461900521563, + "grad_norm": 1.6006196737289429, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8668913841247559, + "num_tokens": 432968000.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "grad_norm": 1.46607506275177, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8829639554023743, + "num_tokens": 433006746.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "grad_norm": 1.5713834762573242, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8765748739242554, + "num_tokens": 433043656.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "grad_norm": 1.4996840953826904, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8729010820388794, + "num_tokens": 433081932.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "grad_norm": 1.5589396953582764, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.8986055254936218, + "num_tokens": 433115178.0, + "step": 11347 + }, + { + "epoch": 
1.4435822414451087, + "grad_norm": 1.690995216369629, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8696151375770569, + "num_tokens": 433150025.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "grad_norm": 1.6540085077285767, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8830190300941467, + "num_tokens": 433184201.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "grad_norm": 1.4297431707382202, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8748157024383545, + "num_tokens": 433224071.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "grad_norm": 1.5178989171981812, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8687739372253418, + "num_tokens": 433262784.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "grad_norm": 1.6135313510894775, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8918559551239014, + "num_tokens": 433294562.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "grad_norm": 1.5282549858093262, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.876712441444397, + "num_tokens": 433332918.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "grad_norm": 1.5982612371444702, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8671550750732422, + "num_tokens": 433368951.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "grad_norm": 1.4142593145370483, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8870111107826233, + "num_tokens": 433406957.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "grad_norm": 1.5397738218307495, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8796894550323486, + "num_tokens": 433447244.0, + "step": 11356 + }, + { + "epoch": 1.4447271339524232, + "grad_norm": 1.6617964506149292, + "learning_rate": 1e-06, + "loss": 0.4101, + 
"mean_token_accuracy": 0.8588192462921143, + "num_tokens": 433487504.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "grad_norm": 1.4045917987823486, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8810921311378479, + "num_tokens": 433529751.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "grad_norm": 1.5500335693359375, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.872125506401062, + "num_tokens": 433566728.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "grad_norm": 1.5908631086349487, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8618754148483276, + "num_tokens": 433604911.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "grad_norm": 1.444144368171692, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8884305953979492, + "num_tokens": 433644856.0, + "step": 11361 + }, + { + "epoch": 1.445363185345376, + "grad_norm": 1.382395625114441, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8700820207595825, + "num_tokens": 433687630.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "grad_norm": 1.624464511871338, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8699137568473816, + "num_tokens": 433726918.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "grad_norm": 1.4757211208343506, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8724147081375122, + "num_tokens": 433767332.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "grad_norm": 1.4794648885726929, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8930926322937012, + "num_tokens": 433801509.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "grad_norm": 1.6291699409484863, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8757764101028442, + "num_tokens": 433837288.0, + "step": 11366 + }, + { + "epoch": 
1.4459992367383285, + "grad_norm": 1.4511196613311768, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8832919597625732, + "num_tokens": 433875765.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "grad_norm": 1.410656452178955, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8925756216049194, + "num_tokens": 433914354.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "grad_norm": 1.4862568378448486, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8674746751785278, + "num_tokens": 433955035.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "grad_norm": 1.6473647356033325, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8780232071876526, + "num_tokens": 433987818.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "grad_norm": 1.5476857423782349, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8816008567810059, + "num_tokens": 434026841.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "grad_norm": 1.4258016347885132, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8929075002670288, + "num_tokens": 434065614.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "grad_norm": 1.4047491550445557, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.878730058670044, + "num_tokens": 434106580.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "grad_norm": 1.5427719354629517, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8725994229316711, + "num_tokens": 434143609.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "grad_norm": 1.446588158607483, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8924655318260193, + "num_tokens": 434180668.0, + "step": 11375 + }, + { + "epoch": 1.447144129245643, + "grad_norm": 1.4829347133636475, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 
0.8992141485214233, + "num_tokens": 434216126.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "grad_norm": 1.4092015027999878, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8930844664573669, + "num_tokens": 434255114.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "grad_norm": 1.7330162525177002, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8566376566886902, + "num_tokens": 434290342.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "grad_norm": 1.474532127380371, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.882986307144165, + "num_tokens": 434333470.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "grad_norm": 1.730515718460083, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8765774965286255, + "num_tokens": 434364223.0, + "step": 11380 + }, + { + "epoch": 1.4477801806385955, + "grad_norm": 1.4360142946243286, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8846850991249084, + "num_tokens": 434401601.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "grad_norm": 1.3921959400177002, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8816406726837158, + "num_tokens": 434444160.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "grad_norm": 1.4720354080200195, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8822099566459656, + "num_tokens": 434482141.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "grad_norm": 1.3995599746704102, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8939061164855957, + "num_tokens": 434518817.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "grad_norm": 1.4337007999420166, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8753522634506226, + "num_tokens": 434561783.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "grad_norm": 
1.743588924407959, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8633825778961182, + "num_tokens": 434599730.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "grad_norm": 1.4969714879989624, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8655122518539429, + "num_tokens": 434640657.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "grad_norm": 1.4210225343704224, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8902667760848999, + "num_tokens": 434679851.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "grad_norm": 1.562807321548462, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8745141625404358, + "num_tokens": 434716080.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "grad_norm": 1.3866182565689087, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8851654529571533, + "num_tokens": 434757965.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "grad_norm": 1.5357229709625244, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8728257417678833, + "num_tokens": 434796652.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "grad_norm": 1.4523117542266846, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8789359331130981, + "num_tokens": 434836689.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "grad_norm": 1.4379355907440186, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8795103430747986, + "num_tokens": 434875075.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "grad_norm": 1.4982205629348755, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8746514320373535, + "num_tokens": 434914311.0, + "step": 11394 + }, + { + "epoch": 1.4495611245388629, + "grad_norm": 1.6190271377563477, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8643547892570496, + 
"num_tokens": 434954289.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "grad_norm": 1.6140145063400269, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8690024018287659, + "num_tokens": 434990657.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "grad_norm": 1.406177043914795, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.892683744430542, + "num_tokens": 435030150.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "grad_norm": 1.502646565437317, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8758443593978882, + "num_tokens": 435070050.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "grad_norm": 1.4476768970489502, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8760601282119751, + "num_tokens": 435110846.0, + "step": 11399 + }, + { + "epoch": 1.4501971759318153, + "grad_norm": 1.5819292068481445, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8659422993659973, + "num_tokens": 435149906.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "grad_norm": 1.3781836032867432, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.889263391494751, + "num_tokens": 435193082.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "grad_norm": 1.6132367849349976, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8833035230636597, + "num_tokens": 435223428.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "grad_norm": 1.517756700515747, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8740691542625427, + "num_tokens": 435266426.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "grad_norm": 1.4453753232955933, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8723254799842834, + "num_tokens": 435307027.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "grad_norm": 1.3106600046157837, + 
"learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8921887874603271, + "num_tokens": 435349127.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "grad_norm": 1.5865366458892822, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8767752647399902, + "num_tokens": 435384523.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "grad_norm": 1.5549445152282715, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.870868980884552, + "num_tokens": 435423504.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "grad_norm": 1.6389799118041992, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8783420920372009, + "num_tokens": 435454798.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "grad_norm": 1.5700664520263672, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8929901123046875, + "num_tokens": 435490678.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "grad_norm": 1.47354257106781, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8859336376190186, + "num_tokens": 435527010.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "grad_norm": 1.3757014274597168, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8808178901672363, + "num_tokens": 435568706.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "grad_norm": 1.5072277784347534, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8801143765449524, + "num_tokens": 435607029.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "grad_norm": 1.4070982933044434, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8864736557006836, + "num_tokens": 435646380.0, + "step": 11413 + }, + { + "epoch": 1.4519781198320825, + "grad_norm": 1.466629147529602, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8743479251861572, + "num_tokens": 435685701.0, + 
"step": 11414 + }, + { + "epoch": 1.452105330110673, + "grad_norm": 1.5215661525726318, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8763605952262878, + "num_tokens": 435723123.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "grad_norm": 1.6345620155334473, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8511972427368164, + "num_tokens": 435761013.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "grad_norm": 1.589471459388733, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8856977224349976, + "num_tokens": 435791471.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "grad_norm": 1.3760253190994263, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8817329406738281, + "num_tokens": 435837638.0, + "step": 11418 + }, + { + "epoch": 1.4526141712250351, + "grad_norm": 1.5879359245300293, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8629854917526245, + "num_tokens": 435871819.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "grad_norm": 1.493686318397522, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8742373585700989, + "num_tokens": 435911317.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "grad_norm": 1.5695174932479858, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8511797785758972, + "num_tokens": 435952672.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "grad_norm": 1.4587595462799072, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.869206428527832, + "num_tokens": 435992608.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "grad_norm": 1.3943032026290894, + "learning_rate": 1e-06, + "loss": 0.2623, + "mean_token_accuracy": 0.9059939980506897, + "num_tokens": 436026936.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "grad_norm": 1.4346630573272705, + "learning_rate": 1e-06, + "loss": 
0.3245, + "mean_token_accuracy": 0.8844031095504761, + "num_tokens": 436065598.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "grad_norm": 1.3494833707809448, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8734151124954224, + "num_tokens": 436112669.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "grad_norm": 1.4364861249923706, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.877052903175354, + "num_tokens": 436153345.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "grad_norm": 1.4351967573165894, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8732962608337402, + "num_tokens": 436198814.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "grad_norm": 1.5644582509994507, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8908131122589111, + "num_tokens": 436233394.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "grad_norm": 1.441325306892395, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.887478232383728, + "num_tokens": 436273483.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "grad_norm": 1.439950704574585, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8823387026786804, + "num_tokens": 436313280.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "grad_norm": 1.4013047218322754, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.880998969078064, + "num_tokens": 436352952.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "grad_norm": 1.5053367614746094, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.879061758518219, + "num_tokens": 436388290.0, + "step": 11432 + }, + { + "epoch": 1.454395115125302, + "grad_norm": 1.7004588842391968, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8701473474502563, + "num_tokens": 436421223.0, + "step": 11433 + }, + { + "epoch": 
1.4545223254038926, + "grad_norm": 1.4467847347259521, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.872115433216095, + "num_tokens": 436462027.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "grad_norm": 1.6424891948699951, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8731327056884766, + "num_tokens": 436494430.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "grad_norm": 1.5015867948532104, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8576313257217407, + "num_tokens": 436536383.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "grad_norm": 1.6309551000595093, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.884475588798523, + "num_tokens": 436566643.0, + "step": 11437 + }, + { + "epoch": 1.4550311665182547, + "grad_norm": 1.573940634727478, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8771733045578003, + "num_tokens": 436600469.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "grad_norm": 1.4862452745437622, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.88545823097229, + "num_tokens": 436635256.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "grad_norm": 1.457275152206421, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8783996105194092, + "num_tokens": 436671026.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "grad_norm": 1.4875019788742065, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.872273325920105, + "num_tokens": 436710183.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "grad_norm": 1.5428509712219238, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8615228533744812, + "num_tokens": 436749792.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "grad_norm": 1.4727474451065063, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 
0.8896910548210144, + "num_tokens": 436787831.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "grad_norm": 1.5569976568222046, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.87457275390625, + "num_tokens": 436822830.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "grad_norm": 1.4711027145385742, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8708783388137817, + "num_tokens": 436864097.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "grad_norm": 1.4669266939163208, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8679843544960022, + "num_tokens": 436904550.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "grad_norm": 1.5335642099380493, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8763980865478516, + "num_tokens": 436939799.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "grad_norm": 1.492540955543518, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8767301440238953, + "num_tokens": 436974313.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "grad_norm": 1.5034430027008057, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8932784795761108, + "num_tokens": 437007000.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "grad_norm": 1.482967495918274, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8861450552940369, + "num_tokens": 437045311.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "grad_norm": 1.4762306213378906, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8846195340156555, + "num_tokens": 437085897.0, + "step": 11451 + }, + { + "epoch": 1.4568121104185219, + "grad_norm": 1.6100934743881226, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8832119703292847, + "num_tokens": 437117898.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "grad_norm": 
1.5377817153930664, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8775776028633118, + "num_tokens": 437156358.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "grad_norm": 1.6821370124816895, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8679498434066772, + "num_tokens": 437188544.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "grad_norm": 1.5533796548843384, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8847373127937317, + "num_tokens": 437221470.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "grad_norm": 1.6005158424377441, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8863701820373535, + "num_tokens": 437257171.0, + "step": 11456 + }, + { + "epoch": 1.4574481618114743, + "grad_norm": 1.6552505493164062, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8718076944351196, + "num_tokens": 437290719.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "grad_norm": 1.509293556213379, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8671000599861145, + "num_tokens": 437331979.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "grad_norm": 1.6222164630889893, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8691005706787109, + "num_tokens": 437373209.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "grad_norm": 1.5012421607971191, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8691143989562988, + "num_tokens": 437413787.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "grad_norm": 1.5383557081222534, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8803073763847351, + "num_tokens": 437448430.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "grad_norm": 1.3854212760925293, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8817485570907593, + 
"num_tokens": 437490800.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "grad_norm": 1.4742777347564697, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8675873279571533, + "num_tokens": 437531973.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "grad_norm": 1.6377099752426147, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8669251203536987, + "num_tokens": 437566864.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "grad_norm": 1.3298039436340332, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8912832736968994, + "num_tokens": 437609266.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "grad_norm": 1.4048165082931519, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8837597370147705, + "num_tokens": 437651232.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "grad_norm": 1.516891360282898, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8743009567260742, + "num_tokens": 437691186.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "grad_norm": 1.5153354406356812, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8684765100479126, + "num_tokens": 437730964.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "grad_norm": 1.408447504043579, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8795613050460815, + "num_tokens": 437769258.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "grad_norm": 1.5553754568099976, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8806115388870239, + "num_tokens": 437802358.0, + "step": 11470 + }, + { + "epoch": 1.4592291057117415, + "grad_norm": 1.422210693359375, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8804895281791687, + "num_tokens": 437843978.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "grad_norm": 1.6056396961212158, + 
"learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8709826469421387, + "num_tokens": 437877292.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "grad_norm": 1.423690676689148, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8780953884124756, + "num_tokens": 437917776.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "grad_norm": 1.3301626443862915, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8914763331413269, + "num_tokens": 437959322.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "grad_norm": 1.5638238191604614, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8607770800590515, + "num_tokens": 437995080.0, + "step": 11475 + }, + { + "epoch": 1.4598651571046941, + "grad_norm": 1.4802371263504028, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8692456483840942, + "num_tokens": 438036309.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "grad_norm": 1.5751813650131226, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8819565773010254, + "num_tokens": 438070082.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "grad_norm": 1.362989068031311, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8813812136650085, + "num_tokens": 438113881.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "grad_norm": 1.624060034751892, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8682253360748291, + "num_tokens": 438154564.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "grad_norm": 1.5287330150604248, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8763976693153381, + "num_tokens": 438194084.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "grad_norm": 1.5266697406768799, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8806231021881104, + "num_tokens": 438230465.0, + 
"step": 11481 + }, + { + "epoch": 1.460628418776237, + "grad_norm": 1.3954405784606934, + "learning_rate": 1e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.8945768475532532, + "num_tokens": 438268311.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "grad_norm": 1.5661377906799316, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8853791356086731, + "num_tokens": 438300198.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "grad_norm": 1.5890413522720337, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8805018067359924, + "num_tokens": 438331436.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "grad_norm": 1.451542854309082, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8913542032241821, + "num_tokens": 438369018.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "grad_norm": 1.6142077445983887, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8725600242614746, + "num_tokens": 438405876.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "grad_norm": 1.4120042324066162, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8899462819099426, + "num_tokens": 438446565.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "grad_norm": 1.377442479133606, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.887295126914978, + "num_tokens": 438488834.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "grad_norm": 1.4888652563095093, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8688880801200867, + "num_tokens": 438527571.0, + "step": 11489 + }, + { + "epoch": 1.4616461010049613, + "grad_norm": 1.5551713705062866, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.868800938129425, + "num_tokens": 438563811.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "grad_norm": 1.6594338417053223, + "learning_rate": 1e-06, + "loss": 
0.421, + "mean_token_accuracy": 0.8589958548545837, + "num_tokens": 438597498.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "grad_norm": 1.4565900564193726, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8590356707572937, + "num_tokens": 438640534.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "grad_norm": 1.4365897178649902, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8643308281898499, + "num_tokens": 438682763.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "grad_norm": 1.5707422494888306, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8742386102676392, + "num_tokens": 438722034.0, + "step": 11494 + }, + { + "epoch": 1.4622821523979137, + "grad_norm": 1.4315807819366455, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.891625165939331, + "num_tokens": 438759280.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "grad_norm": 1.5351896286010742, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8550887703895569, + "num_tokens": 438799898.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "grad_norm": 1.5348658561706543, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8650999069213867, + "num_tokens": 438842534.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "grad_norm": 1.7102136611938477, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8492264151573181, + "num_tokens": 438873894.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "grad_norm": 1.5261667966842651, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8864596486091614, + "num_tokens": 438909215.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "grad_norm": 1.6609371900558472, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8683904409408569, + "num_tokens": 438944741.0, + "step": 11500 + }, + { + "epoch": 
1.4630454140694569, + "grad_norm": 1.4717352390289307, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8594350814819336, + "num_tokens": 438984097.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "grad_norm": 1.494760513305664, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8818589448928833, + "num_tokens": 439021945.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "grad_norm": 1.4015629291534424, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8684561252593994, + "num_tokens": 439064641.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "grad_norm": 1.4695403575897217, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8894736170768738, + "num_tokens": 439103571.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "grad_norm": 1.6246639490127563, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8902180790901184, + "num_tokens": 439134995.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "grad_norm": 1.3180876970291138, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8856310248374939, + "num_tokens": 439177189.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "grad_norm": 1.4335798025131226, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.872349739074707, + "num_tokens": 439220797.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "grad_norm": 1.4667032957077026, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8572676181793213, + "num_tokens": 439265716.0, + "step": 11508 + }, + { + "epoch": 1.4640630962981809, + "grad_norm": 1.509757161140442, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.873621940612793, + "num_tokens": 439305495.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "grad_norm": 1.4105217456817627, + "learning_rate": 1e-06, + "loss": 0.2925, + 
"mean_token_accuracy": 0.8974428176879883, + "num_tokens": 439344088.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "grad_norm": 1.550811767578125, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8746873140335083, + "num_tokens": 439381024.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "grad_norm": 1.4856151342391968, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8808223605155945, + "num_tokens": 439418788.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "grad_norm": 1.572145700454712, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8693955540657043, + "num_tokens": 439456742.0, + "step": 11513 + }, + { + "epoch": 1.4646991476911335, + "grad_norm": 1.5445842742919922, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8757474422454834, + "num_tokens": 439498078.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "grad_norm": 1.3859810829162598, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8921864032745361, + "num_tokens": 439537278.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "grad_norm": 1.5813833475112915, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8863860964775085, + "num_tokens": 439571323.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "grad_norm": 1.4659264087677002, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8780637979507446, + "num_tokens": 439610918.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "grad_norm": 1.398163914680481, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8707836866378784, + "num_tokens": 439657402.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "grad_norm": 1.6650878190994263, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8873202204704285, + "num_tokens": 439691935.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + 
"grad_norm": 1.501778483390808, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8825108408927917, + "num_tokens": 439729606.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "grad_norm": 1.4498130083084106, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8636047840118408, + "num_tokens": 439771411.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "grad_norm": 1.4611490964889526, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.884450376033783, + "num_tokens": 439807887.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "grad_norm": 1.5046663284301758, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8759282827377319, + "num_tokens": 439847825.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "grad_norm": 1.6209118366241455, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8790441155433655, + "num_tokens": 439885123.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "grad_norm": 1.615427851676941, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8642199039459229, + "num_tokens": 439920899.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "grad_norm": 1.5643898248672485, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.888432502746582, + "num_tokens": 439951464.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "grad_norm": 1.407108187675476, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8771041035652161, + "num_tokens": 439993230.0, + "step": 11527 + }, + { + "epoch": 1.4664800915914005, + "grad_norm": 1.467242956161499, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.884410560131073, + "num_tokens": 440032402.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "grad_norm": 1.5120165348052979, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8790949583053589, + 
"num_tokens": 440072692.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "grad_norm": 1.3900706768035889, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8813040256500244, + "num_tokens": 440116142.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "grad_norm": 1.3319758176803589, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8964279890060425, + "num_tokens": 440159570.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "grad_norm": 1.5123286247253418, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8733162879943848, + "num_tokens": 440198226.0, + "step": 11532 + }, + { + "epoch": 1.467116142984353, + "grad_norm": 1.2858184576034546, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8864234685897827, + "num_tokens": 440243338.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "grad_norm": 1.6429259777069092, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8617923259735107, + "num_tokens": 440282082.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "grad_norm": 1.5207836627960205, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8707090616226196, + "num_tokens": 440320384.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "grad_norm": 1.5691708326339722, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8560107350349426, + "num_tokens": 440360136.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "grad_norm": 1.4899790287017822, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8752927780151367, + "num_tokens": 440398164.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "grad_norm": 1.6447159051895142, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8535665273666382, + "num_tokens": 440432776.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "grad_norm": 1.547837734222412, + 
"learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8802501559257507, + "num_tokens": 440467710.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "grad_norm": 1.4913369417190552, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8763731122016907, + "num_tokens": 440508408.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "grad_norm": 1.3125593662261963, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8957877159118652, + "num_tokens": 440550281.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "grad_norm": 1.4871268272399902, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8701364398002625, + "num_tokens": 440590172.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "grad_norm": 1.4496573209762573, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8848851919174194, + "num_tokens": 440628417.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "grad_norm": 1.4918264150619507, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8862836956977844, + "num_tokens": 440666042.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "grad_norm": 1.5495511293411255, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8587481379508972, + "num_tokens": 440706855.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "grad_norm": 1.4076682329177856, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8871899247169495, + "num_tokens": 440747811.0, + "step": 11546 + }, + { + "epoch": 1.4688970868846203, + "grad_norm": 1.455473780632019, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8814719915390015, + "num_tokens": 440786631.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "grad_norm": 1.5914068222045898, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8725304007530212, + "num_tokens": 440823065.0, + 
"step": 11548 + }, + { + "epoch": 1.4691515074418013, + "grad_norm": 1.5810716152191162, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8672685027122498, + "num_tokens": 440857763.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "grad_norm": 1.4850084781646729, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.879301905632019, + "num_tokens": 440898479.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "grad_norm": 1.87924063205719, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8687169551849365, + "num_tokens": 440929573.0, + "step": 11551 + }, + { + "epoch": 1.4695331382775727, + "grad_norm": 1.4547314643859863, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.894993782043457, + "num_tokens": 440968014.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "grad_norm": 1.5748496055603027, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8614212870597839, + "num_tokens": 441005550.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "grad_norm": 1.5015833377838135, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8853042125701904, + "num_tokens": 441041148.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "grad_norm": 1.3332109451293945, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8783779144287109, + "num_tokens": 441087421.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "grad_norm": 1.4223421812057495, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8839642405509949, + "num_tokens": 441129667.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "grad_norm": 1.5607279539108276, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8838404417037964, + "num_tokens": 441163348.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "grad_norm": 1.4733222723007202, + "learning_rate": 1e-06, + 
"loss": 0.3216, + "mean_token_accuracy": 0.8871315717697144, + "num_tokens": 441197981.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "grad_norm": 1.4179767370224, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8828452825546265, + "num_tokens": 441235951.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "grad_norm": 1.5950971841812134, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.865410566329956, + "num_tokens": 441277307.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "grad_norm": 1.493560791015625, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8669785261154175, + "num_tokens": 441316042.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "grad_norm": 1.4433971643447876, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8969398140907288, + "num_tokens": 441353809.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "grad_norm": 1.4954177141189575, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8787358999252319, + "num_tokens": 441392223.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "grad_norm": 1.416128396987915, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8724032044410706, + "num_tokens": 441435160.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "grad_norm": 1.5298879146575928, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8747751712799072, + "num_tokens": 441474183.0, + "step": 11565 + }, + { + "epoch": 1.47131408217784, + "grad_norm": 1.6191731691360474, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8677870035171509, + "num_tokens": 441507547.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "grad_norm": 1.5980192422866821, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8816137313842773, + "num_tokens": 441540670.0, + "step": 11567 + }, + { + "epoch": 
1.471568502735021, + "grad_norm": 1.5085667371749878, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8758493065834045, + "num_tokens": 441578534.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "grad_norm": 1.8973451852798462, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8839455842971802, + "num_tokens": 441613233.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "grad_norm": 1.5249117612838745, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8844040632247925, + "num_tokens": 441647157.0, + "step": 11570 + }, + { + "epoch": 1.4719501335707925, + "grad_norm": 1.4475984573364258, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8737025260925293, + "num_tokens": 441690563.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "grad_norm": 1.5594054460525513, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8822291493415833, + "num_tokens": 441728367.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "grad_norm": 1.6190111637115479, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.873237133026123, + "num_tokens": 441763068.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "grad_norm": 1.389217734336853, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.870468258857727, + "num_tokens": 441805971.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "grad_norm": 1.6634869575500488, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.873745322227478, + "num_tokens": 441843453.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "grad_norm": 1.626280426979065, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8670958280563354, + "num_tokens": 441877123.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "grad_norm": 1.614939570426941, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 
0.8748465776443481, + "num_tokens": 441912460.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "grad_norm": 1.3496932983398438, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8863270282745361, + "num_tokens": 441954202.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "grad_norm": 1.3979172706604004, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8954063653945923, + "num_tokens": 441994553.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "grad_norm": 1.5177406072616577, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8708351850509644, + "num_tokens": 442033031.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "grad_norm": 1.555701732635498, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8601053953170776, + "num_tokens": 442069496.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "grad_norm": 1.6454799175262451, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.870631992816925, + "num_tokens": 442104124.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "grad_norm": 1.4773831367492676, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8715741038322449, + "num_tokens": 442145250.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "grad_norm": 1.536685585975647, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8672887086868286, + "num_tokens": 442180936.0, + "step": 11584 + }, + { + "epoch": 1.4737310774710597, + "grad_norm": 1.2540801763534546, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8892291188240051, + "num_tokens": 442228454.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "grad_norm": 1.5560791492462158, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8837499022483826, + "num_tokens": 442263058.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "grad_norm": 
1.4613374471664429, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8776689767837524, + "num_tokens": 442304803.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "grad_norm": 1.5501868724822998, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8761004209518433, + "num_tokens": 442338658.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "grad_norm": 1.4909641742706299, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8677946329116821, + "num_tokens": 442377388.0, + "step": 11589 + }, + { + "epoch": 1.4743671288640123, + "grad_norm": 1.6017370223999023, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8698590993881226, + "num_tokens": 442414138.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "grad_norm": 1.5190480947494507, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8863010406494141, + "num_tokens": 442447568.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "grad_norm": 1.362406611442566, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8855865597724915, + "num_tokens": 442490840.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "grad_norm": 1.3954787254333496, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8763997554779053, + "num_tokens": 442534998.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "grad_norm": 1.590956449508667, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8594335317611694, + "num_tokens": 442572838.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "grad_norm": 1.5402048826217651, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8689804673194885, + "num_tokens": 442606851.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "grad_norm": 1.5092170238494873, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8813181519508362, + 
"num_tokens": 442643519.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "grad_norm": 1.5690923929214478, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8892493844032288, + "num_tokens": 442674573.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "grad_norm": 1.445793628692627, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8903447389602661, + "num_tokens": 442715179.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "grad_norm": 1.5047065019607544, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8701640963554382, + "num_tokens": 442752956.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "grad_norm": 1.5270042419433594, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8784520626068115, + "num_tokens": 442790288.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "grad_norm": 1.7276583909988403, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8666689395904541, + "num_tokens": 442827040.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "grad_norm": 1.4903576374053955, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8551231622695923, + "num_tokens": 442870597.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "grad_norm": 1.5838990211486816, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8744591474533081, + "num_tokens": 442908015.0, + "step": 11603 + }, + { + "epoch": 1.4761480727642793, + "grad_norm": 1.4755403995513916, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8785648345947266, + "num_tokens": 442946680.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "grad_norm": 1.538899302482605, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8736512660980225, + "num_tokens": 442986640.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "grad_norm": 1.4847321510314941, + 
"learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8849185705184937, + "num_tokens": 443024552.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "grad_norm": 1.4194940328598022, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8843295574188232, + "num_tokens": 443063812.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "grad_norm": 1.4860519170761108, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8870351314544678, + "num_tokens": 443097586.0, + "step": 11608 + }, + { + "epoch": 1.476784124157232, + "grad_norm": 1.4880568981170654, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8673562407493591, + "num_tokens": 443143530.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "grad_norm": 1.5310395956039429, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8694430589675903, + "num_tokens": 443182279.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "grad_norm": 1.3547563552856445, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8786731362342834, + "num_tokens": 443225851.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "grad_norm": 1.5200695991516113, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8912185430526733, + "num_tokens": 443258781.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "grad_norm": 1.5680887699127197, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8813318014144897, + "num_tokens": 443290731.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "grad_norm": 1.5039818286895752, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8780062198638916, + "num_tokens": 443331835.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "grad_norm": 1.536838412284851, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.881891131401062, + "num_tokens": 443370349.0, + 
"step": 11615 + }, + { + "epoch": 1.4776745961073654, + "grad_norm": 1.552586317062378, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8758915066719055, + "num_tokens": 443407515.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "grad_norm": 2.3110647201538086, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8918198347091675, + "num_tokens": 443445296.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "grad_norm": 1.4788508415222168, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8801878094673157, + "num_tokens": 443483615.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "grad_norm": 1.5533074140548706, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8711261749267578, + "num_tokens": 443519522.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "grad_norm": 1.4039647579193115, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8786200284957886, + "num_tokens": 443560950.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "grad_norm": 1.4094544649124146, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8827346563339233, + "num_tokens": 443600208.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "grad_norm": 1.5303802490234375, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.87251877784729, + "num_tokens": 443637655.0, + "step": 11622 + }, + { + "epoch": 1.478565068057499, + "grad_norm": 1.6115847826004028, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8818163871765137, + "num_tokens": 443667883.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "grad_norm": 1.5624364614486694, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8602569699287415, + "num_tokens": 443708370.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "grad_norm": 1.5242019891738892, + "learning_rate": 1e-06, + "loss": 
0.3322, + "mean_token_accuracy": 0.8841443061828613, + "num_tokens": 443742352.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "grad_norm": 1.6792538166046143, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8695241212844849, + "num_tokens": 443779096.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "grad_norm": 1.452934980392456, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8900424242019653, + "num_tokens": 443815426.0, + "step": 11627 + }, + { + "epoch": 1.4792011194504515, + "grad_norm": 1.4907853603363037, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8764998912811279, + "num_tokens": 443854061.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "grad_norm": 1.5345648527145386, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8790183663368225, + "num_tokens": 443889148.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "grad_norm": 1.4813772439956665, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8767247200012207, + "num_tokens": 443926189.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "grad_norm": 1.5773675441741943, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8685654997825623, + "num_tokens": 443965064.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "grad_norm": 1.580868124961853, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8740045428276062, + "num_tokens": 443998800.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "grad_norm": 1.424136996269226, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8788207769393921, + "num_tokens": 444045461.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "grad_norm": 1.467394471168518, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8892667293548584, + "num_tokens": 444085520.0, + "step": 11634 + }, + { + "epoch": 
1.4800915914005852, + "grad_norm": 1.573883295059204, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8883421421051025, + "num_tokens": 444119018.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "grad_norm": 1.329027771949768, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8698863983154297, + "num_tokens": 444164873.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "grad_norm": 1.5165079832077026, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.860886812210083, + "num_tokens": 444203570.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "grad_norm": 1.4086103439331055, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8583343029022217, + "num_tokens": 444250069.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "grad_norm": 1.49436354637146, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8677421808242798, + "num_tokens": 444291396.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "grad_norm": 1.483312964439392, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8773618340492249, + "num_tokens": 444327923.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "grad_norm": 1.5192999839782715, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.860185980796814, + "num_tokens": 444371637.0, + "step": 11641 + }, + { + "epoch": 1.4809820633507187, + "grad_norm": 1.6059435606002808, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8835821151733398, + "num_tokens": 444407084.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "grad_norm": 1.5337328910827637, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8839515447616577, + "num_tokens": 444442286.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "grad_norm": 1.5513091087341309, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 
0.8621037006378174, + "num_tokens": 444477838.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "grad_norm": 1.491240382194519, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8809159994125366, + "num_tokens": 444517393.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "grad_norm": 1.5127688646316528, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8765696883201599, + "num_tokens": 444554636.0, + "step": 11646 + }, + { + "epoch": 1.4816181147436713, + "grad_norm": 1.4995596408843994, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8739769458770752, + "num_tokens": 444592739.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "grad_norm": 1.5722683668136597, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8777676820755005, + "num_tokens": 444627879.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "grad_norm": 1.624036192893982, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8735487461090088, + "num_tokens": 444660924.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "grad_norm": 1.5039387941360474, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8777661919593811, + "num_tokens": 444698846.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "grad_norm": 1.5852562189102173, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8869981169700623, + "num_tokens": 444731635.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "grad_norm": 1.6351690292358398, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8712165951728821, + "num_tokens": 444766809.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "grad_norm": 1.5153793096542358, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.882235586643219, + "num_tokens": 444801972.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "grad_norm": 
1.5697354078292847, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8774877786636353, + "num_tokens": 444837214.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "grad_norm": 1.570702314376831, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8798838257789612, + "num_tokens": 444870847.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "grad_norm": 1.5190472602844238, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8901218175888062, + "num_tokens": 444907528.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "grad_norm": 1.5485608577728271, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8793342113494873, + "num_tokens": 444942394.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "grad_norm": 1.595672845840454, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8715349435806274, + "num_tokens": 444975356.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "grad_norm": 1.5325641632080078, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8871703743934631, + "num_tokens": 445014059.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "grad_norm": 1.444392442703247, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8840261697769165, + "num_tokens": 445052303.0, + "step": 11660 + }, + { + "epoch": 1.4833990586439385, + "grad_norm": 1.5558336973190308, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8688238859176636, + "num_tokens": 445089099.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "grad_norm": 1.5494953393936157, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8666631579399109, + "num_tokens": 445124574.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "grad_norm": 1.5583903789520264, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8693351745605469, + "num_tokens": 
445160897.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "grad_norm": 1.5509231090545654, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8757280111312866, + "num_tokens": 445195620.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "grad_norm": 1.349984884262085, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8814979791641235, + "num_tokens": 445240999.0, + "step": 11665 + }, + { + "epoch": 1.484035110036891, + "grad_norm": 1.3472341299057007, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8919164538383484, + "num_tokens": 445285278.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "grad_norm": 1.467995285987854, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8856012225151062, + "num_tokens": 445327966.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "grad_norm": 1.5088719129562378, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8776569366455078, + "num_tokens": 445365697.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "grad_norm": 1.5509668588638306, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8786556720733643, + "num_tokens": 445400751.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "grad_norm": 1.474939227104187, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8739585876464844, + "num_tokens": 445441568.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "grad_norm": 1.4544814825057983, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8665963411331177, + "num_tokens": 445483941.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "grad_norm": 1.477992057800293, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8755657076835632, + "num_tokens": 445523130.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "grad_norm": 1.6592113971710205, + "learning_rate": 1e-06, 
+ "loss": 0.4112, + "mean_token_accuracy": 0.8585143089294434, + "num_tokens": 445563622.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "grad_norm": 1.5933351516723633, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8660944700241089, + "num_tokens": 445600098.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "grad_norm": 1.6867690086364746, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.865930438041687, + "num_tokens": 445636772.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "grad_norm": 1.5989506244659424, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8603016138076782, + "num_tokens": 445672990.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "grad_norm": 1.6505964994430542, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8750267028808594, + "num_tokens": 445706440.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "grad_norm": 1.614161491394043, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8792160749435425, + "num_tokens": 445743831.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "grad_norm": 1.651674509048462, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8629836440086365, + "num_tokens": 445778682.0, + "step": 11679 + }, + { + "epoch": 1.485816053937158, + "grad_norm": 1.5170806646347046, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8775218725204468, + "num_tokens": 445815607.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "grad_norm": 1.634498953819275, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8749490976333618, + "num_tokens": 445850598.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "grad_norm": 1.3653002977371216, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8875520825386047, + "num_tokens": 445896095.0, + "step": 11682 + }, + { + "epoch": 
1.4861976847729297, + "grad_norm": 1.399619221687317, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8916712999343872, + "num_tokens": 445941344.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "grad_norm": 1.6129094362258911, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8840343952178955, + "num_tokens": 445975937.0, + "step": 11684 + }, + { + "epoch": 1.4864521053301107, + "grad_norm": 1.365707278251648, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8934051990509033, + "num_tokens": 446016791.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "grad_norm": 1.3818055391311646, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8777210712432861, + "num_tokens": 446059248.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "grad_norm": 1.547592043876648, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8609491586685181, + "num_tokens": 446100022.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "grad_norm": 1.5777168273925781, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8682544231414795, + "num_tokens": 446135920.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "grad_norm": 1.4204683303833008, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8843295574188232, + "num_tokens": 446173568.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "grad_norm": 1.3514602184295654, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8854195475578308, + "num_tokens": 446217708.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "grad_norm": 1.6633027791976929, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8669251799583435, + "num_tokens": 446252205.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "grad_norm": 1.5741891860961914, + "learning_rate": 1e-06, + "loss": 0.3545, + 
"mean_token_accuracy": 0.8746545314788818, + "num_tokens": 446286312.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "grad_norm": 1.5281347036361694, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8836691379547119, + "num_tokens": 446324099.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "grad_norm": 1.3485292196273804, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8791208267211914, + "num_tokens": 446366277.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "grad_norm": 1.3976644277572632, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8823761940002441, + "num_tokens": 446406070.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "grad_norm": 1.6565229892730713, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8665248155593872, + "num_tokens": 446439181.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "grad_norm": 1.748753309249878, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8728480339050293, + "num_tokens": 446475999.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "grad_norm": 1.5040667057037354, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.88283771276474, + "num_tokens": 446515254.0, + "step": 11698 + }, + { + "epoch": 1.4882330492303777, + "grad_norm": 1.5045742988586426, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8769797086715698, + "num_tokens": 446552996.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "grad_norm": 1.3131479024887085, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8887686729431152, + "num_tokens": 446594707.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "grad_norm": 1.4897123575210571, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8735175132751465, + "num_tokens": 446633032.0, + "step": 11701 + }, + { + "epoch": 
1.4886146800661493, + "grad_norm": 1.4659899473190308, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8646196126937866, + "num_tokens": 446673475.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "grad_norm": 1.6330840587615967, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8837401866912842, + "num_tokens": 446705336.0, + "step": 11703 + }, + { + "epoch": 1.4888691006233303, + "grad_norm": 1.5012961626052856, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8739056587219238, + "num_tokens": 446741423.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "grad_norm": 1.5426205396652222, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8732551336288452, + "num_tokens": 446777159.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "grad_norm": 1.4862085580825806, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.86463463306427, + "num_tokens": 446820583.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "grad_norm": 1.487500786781311, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8876601457595825, + "num_tokens": 446863216.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "grad_norm": 1.5009374618530273, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8799235224723816, + "num_tokens": 446900224.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "grad_norm": 1.619864583015442, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8806552886962891, + "num_tokens": 446933144.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "grad_norm": 1.479474425315857, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8805162310600281, + "num_tokens": 446969732.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "grad_norm": 1.4930167198181152, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 
0.8723260164260864, + "num_tokens": 447011064.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "grad_norm": 1.6617431640625, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8692582845687866, + "num_tokens": 447050877.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "grad_norm": 1.3797149658203125, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8865856528282166, + "num_tokens": 447089854.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "grad_norm": 1.521990418434143, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8762571811676025, + "num_tokens": 447126290.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "grad_norm": 1.4849475622177124, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8758922219276428, + "num_tokens": 447161893.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "grad_norm": 1.5406787395477295, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8586629629135132, + "num_tokens": 447199005.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "grad_norm": 1.4972591400146484, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8834537863731384, + "num_tokens": 447232799.0, + "step": 11717 + }, + { + "epoch": 1.4906500445235975, + "grad_norm": 1.4992034435272217, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8751646280288696, + "num_tokens": 447267177.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "grad_norm": 1.583272099494934, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8522713780403137, + "num_tokens": 447308495.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "grad_norm": 1.4778884649276733, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8790296912193298, + "num_tokens": 447349042.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "grad_norm": 
1.3797173500061035, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8745119571685791, + "num_tokens": 447392195.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "grad_norm": 1.4888288974761963, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8901341557502747, + "num_tokens": 447426791.0, + "step": 11722 + }, + { + "epoch": 1.49128609591655, + "grad_norm": 1.4046697616577148, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8837714195251465, + "num_tokens": 447466407.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "grad_norm": 1.3993618488311768, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8693413734436035, + "num_tokens": 447511431.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "grad_norm": 1.4905821084976196, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8805392384529114, + "num_tokens": 447549171.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "grad_norm": 1.365965485572815, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8812837600708008, + "num_tokens": 447592731.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "grad_norm": 1.5392459630966187, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8760258555412292, + "num_tokens": 447633961.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "grad_norm": 1.6018800735473633, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8621344566345215, + "num_tokens": 447673697.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "grad_norm": 1.5849069356918335, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.872471034526825, + "num_tokens": 447709460.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "grad_norm": 1.441637635231018, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8752738237380981, + "num_tokens": 
447752975.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "grad_norm": 1.5464599132537842, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8601653575897217, + "num_tokens": 447794379.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "grad_norm": 1.638214111328125, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8801319599151611, + "num_tokens": 447827034.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "grad_norm": 1.6038451194763184, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8505489826202393, + "num_tokens": 447865881.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "grad_norm": 1.5297141075134277, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8822665214538574, + "num_tokens": 447900988.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "grad_norm": 1.5152639150619507, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8719791173934937, + "num_tokens": 447940361.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "grad_norm": 1.449527382850647, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8735498189926147, + "num_tokens": 447980511.0, + "step": 11736 + }, + { + "epoch": 1.4930670398168173, + "grad_norm": 1.5195140838623047, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.877385675907135, + "num_tokens": 448021303.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "grad_norm": 1.6278971433639526, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8689398765563965, + "num_tokens": 448053579.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "grad_norm": 1.5245190858840942, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8740779757499695, + "num_tokens": 448089952.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "grad_norm": 1.4901537895202637, + "learning_rate": 
1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8737821578979492, + "num_tokens": 448125790.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "grad_norm": 1.7739105224609375, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8713778853416443, + "num_tokens": 448156232.0, + "step": 11741 + }, + { + "epoch": 1.4937030912097697, + "grad_norm": 1.5586752891540527, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8844866156578064, + "num_tokens": 448191223.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "grad_norm": 1.4126431941986084, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8818826675415039, + "num_tokens": 448231878.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "grad_norm": 1.5979630947113037, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8684265613555908, + "num_tokens": 448267323.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "grad_norm": 1.4956579208374023, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8731803297996521, + "num_tokens": 448303287.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "grad_norm": 1.4140844345092773, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8730981945991516, + "num_tokens": 448344835.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "grad_norm": 1.4765441417694092, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8876312971115112, + "num_tokens": 448382458.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "grad_norm": 1.5184779167175293, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8806585073471069, + "num_tokens": 448421155.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "grad_norm": 1.5220462083816528, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8750916719436646, + "num_tokens": 448457220.0, + "step": 11749 + }, 
+ { + "epoch": 1.4947207734384937, + "grad_norm": 1.4762295484542847, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8782361745834351, + "num_tokens": 448495923.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "grad_norm": 1.57068932056427, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8964669704437256, + "num_tokens": 448527374.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "grad_norm": 1.4397523403167725, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8774831295013428, + "num_tokens": 448570751.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "grad_norm": 1.5196105241775513, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8560812473297119, + "num_tokens": 448610941.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "grad_norm": 1.6086896657943726, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8740248680114746, + "num_tokens": 448643615.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "grad_norm": 1.5127086639404297, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8648768067359924, + "num_tokens": 448681952.0, + "step": 11755 + }, + { + "epoch": 1.495484035110037, + "grad_norm": 1.5284390449523926, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8660974502563477, + "num_tokens": 448721867.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "grad_norm": 1.5315507650375366, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8721773624420166, + "num_tokens": 448760691.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "grad_norm": 1.4696106910705566, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8679684996604919, + "num_tokens": 448803300.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "grad_norm": 1.5292571783065796, + "learning_rate": 1e-06, + "loss": 0.3586, + 
"mean_token_accuracy": 0.8745179176330566, + "num_tokens": 448844629.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "grad_norm": 1.6182199716567993, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8721097707748413, + "num_tokens": 448880710.0, + "step": 11760 + }, + { + "epoch": 1.4961200865029896, + "grad_norm": 1.6464171409606934, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8684079647064209, + "num_tokens": 448914939.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "grad_norm": 1.4477332830429077, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8705971240997314, + "num_tokens": 448955069.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "grad_norm": 1.484618067741394, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8731139898300171, + "num_tokens": 448993350.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "grad_norm": 1.537322759628296, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8729822635650635, + "num_tokens": 449031509.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "grad_norm": 1.661268711090088, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8477798700332642, + "num_tokens": 449070226.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "grad_norm": 1.5285577774047852, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8808650970458984, + "num_tokens": 449109291.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "grad_norm": 1.6443113088607788, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8711305260658264, + "num_tokens": 449141386.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "grad_norm": 1.611770749092102, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8867397308349609, + "num_tokens": 449172815.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + 
"grad_norm": 1.5318360328674316, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8618123531341553, + "num_tokens": 449210748.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "grad_norm": 1.4607877731323242, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8725515007972717, + "num_tokens": 449251250.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "grad_norm": 1.5607374906539917, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8740852475166321, + "num_tokens": 449285201.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "grad_norm": 1.524107575416565, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8684353828430176, + "num_tokens": 449326948.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "grad_norm": 1.6312074661254883, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8623318672180176, + "num_tokens": 449369898.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "grad_norm": 1.5349280834197998, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8738207817077637, + "num_tokens": 449406623.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + "grad_norm": 1.6146228313446045, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8629245758056641, + "num_tokens": 449441661.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "grad_norm": 1.5701839923858643, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8852221965789795, + "num_tokens": 449477821.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "grad_norm": 1.5854287147521973, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8894667625427246, + "num_tokens": 449511769.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "grad_norm": 1.6348010301589966, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8725113272666931, + 
"num_tokens": 449548340.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "grad_norm": 1.5709521770477295, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8693337440490723, + "num_tokens": 449584330.0, + "step": 11779 + }, + { + "epoch": 1.4985370817962091, + "grad_norm": 1.417004942893982, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.878090500831604, + "num_tokens": 449626786.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "grad_norm": 1.5265344381332397, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8893316388130188, + "num_tokens": 449660702.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "grad_norm": 1.4617114067077637, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8706188201904297, + "num_tokens": 449699565.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "grad_norm": 1.5508379936218262, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8731201887130737, + "num_tokens": 449733224.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "grad_norm": 1.5198405981063843, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8829701542854309, + "num_tokens": 449767374.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "grad_norm": 1.5372015237808228, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8750571608543396, + "num_tokens": 449805035.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "grad_norm": 1.5071102380752563, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8761724233627319, + "num_tokens": 449843019.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "grad_norm": 1.449917197227478, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8820377588272095, + "num_tokens": 449879848.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "grad_norm": 1.5940043926239014, + 
"learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8723083734512329, + "num_tokens": 449921429.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "grad_norm": 1.645824670791626, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.87464439868927, + "num_tokens": 449952513.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "grad_norm": 1.4851771593093872, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8873718976974487, + "num_tokens": 449987997.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "grad_norm": 1.5933655500411987, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8717287182807922, + "num_tokens": 450029626.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "grad_norm": 1.436598539352417, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8828537464141846, + "num_tokens": 450072847.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "grad_norm": 1.5325700044631958, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8744139671325684, + "num_tokens": 450111560.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + "grad_norm": 1.3582309484481812, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.880239725112915, + "num_tokens": 450154687.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "grad_norm": 1.5007010698318481, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8693006634712219, + "num_tokens": 450195885.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "grad_norm": 1.3516966104507446, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8865916132926941, + "num_tokens": 450236797.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "grad_norm": 1.6015158891677856, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8614935278892517, + "num_tokens": 450275949.0, + 
"step": 11797 + }, + { + "epoch": 1.5008268668108382, + "grad_norm": 1.5304025411605835, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8873298168182373, + "num_tokens": 450310215.0, + "step": 11798 + }, + { + "epoch": 1.5009540770894287, + "grad_norm": 1.5079456567764282, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8472846150398254, + "num_tokens": 450354892.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "grad_norm": 1.3797293901443481, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8803367614746094, + "num_tokens": 450397818.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "grad_norm": 1.5320886373519897, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8804440498352051, + "num_tokens": 450432239.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "grad_norm": 1.4765533208847046, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.882972002029419, + "num_tokens": 450472606.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "grad_norm": 1.50591242313385, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.862669825553894, + "num_tokens": 450513039.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "grad_norm": 1.5167182683944702, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8695826530456543, + "num_tokens": 450551207.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "grad_norm": 1.5050259828567505, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8829482197761536, + "num_tokens": 450586095.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "grad_norm": 1.4770543575286865, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.859447181224823, + "num_tokens": 450625500.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "grad_norm": 1.4530470371246338, + "learning_rate": 1e-06, + "loss": 
0.3242, + "mean_token_accuracy": 0.883655309677124, + "num_tokens": 450664310.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "grad_norm": 1.3297700881958008, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8806986212730408, + "num_tokens": 450708504.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "grad_norm": 1.3432456254959106, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8880858421325684, + "num_tokens": 450750452.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "grad_norm": 1.5928982496261597, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8874136209487915, + "num_tokens": 450786052.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "grad_norm": 1.5498031377792358, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.870673418045044, + "num_tokens": 450822863.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "grad_norm": 1.4777421951293945, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8877876996994019, + "num_tokens": 450855441.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "grad_norm": 1.6913797855377197, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8635513782501221, + "num_tokens": 450889617.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "grad_norm": 1.580983281135559, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8718435168266296, + "num_tokens": 450924216.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "grad_norm": 1.4800958633422852, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8839721083641052, + "num_tokens": 450961476.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "grad_norm": 1.528118371963501, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.878097653388977, + "num_tokens": 451000915.0, + "step": 11816 + }, + { + "epoch": 
1.503243862104058, + "grad_norm": 1.5074290037155151, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8756287097930908, + "num_tokens": 451041037.0, + "step": 11817 + }, + { + "epoch": 1.5033710723826486, + "grad_norm": 1.523082971572876, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8678788542747498, + "num_tokens": 451086215.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "grad_norm": 1.4477802515029907, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8856162428855896, + "num_tokens": 451124457.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "grad_norm": 1.5336140394210815, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8717230558395386, + "num_tokens": 451164154.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "grad_norm": 1.5088218450546265, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8795986175537109, + "num_tokens": 451202033.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "grad_norm": 1.548161506652832, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8772637844085693, + "num_tokens": 451237480.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "grad_norm": 1.4182474613189697, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8937758207321167, + "num_tokens": 451277089.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "grad_norm": 1.4230470657348633, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.869741678237915, + "num_tokens": 451318240.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "grad_norm": 1.5727040767669678, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8807922601699829, + "num_tokens": 451358847.0, + "step": 11825 + }, + { + "epoch": 1.5043887546113726, + "grad_norm": 1.608305811882019, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 
0.8666312098503113, + "num_tokens": 451398292.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "grad_norm": 1.587134838104248, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8688259124755859, + "num_tokens": 451439508.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "grad_norm": 1.4595288038253784, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8819687962532043, + "num_tokens": 451477491.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "grad_norm": 1.5827921628952026, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8672668933868408, + "num_tokens": 451514213.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "grad_norm": 1.6044714450836182, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8717698454856873, + "num_tokens": 451548609.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "grad_norm": 1.4127371311187744, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.885025143623352, + "num_tokens": 451587698.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "grad_norm": 1.4291062355041504, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8772469758987427, + "num_tokens": 451628744.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "grad_norm": 1.566280722618103, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8738317489624023, + "num_tokens": 451663737.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "grad_norm": 1.499182939529419, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8774727582931519, + "num_tokens": 451698507.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "grad_norm": 1.5387834310531616, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8600012063980103, + "num_tokens": 451733531.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "grad_norm": 
1.4758421182632446, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8820104598999023, + "num_tokens": 451770811.0, + "step": 11836 + }, + { + "epoch": 1.5057880676758684, + "grad_norm": 1.7056772708892822, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8668071031570435, + "num_tokens": 451800756.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "grad_norm": 1.4533300399780273, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8841603994369507, + "num_tokens": 451837995.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "grad_norm": 1.5407761335372925, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8570494651794434, + "num_tokens": 451878858.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "grad_norm": 1.527315378189087, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8836023807525635, + "num_tokens": 451913316.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "grad_norm": 1.4931213855743408, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8841985464096069, + "num_tokens": 451950686.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "grad_norm": 1.5765693187713623, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8777350187301636, + "num_tokens": 451983341.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "grad_norm": 1.5379210710525513, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8694462776184082, + "num_tokens": 452024817.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "grad_norm": 1.6083106994628906, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.871938169002533, + "num_tokens": 452062670.0, + "step": 11844 + }, + { + "epoch": 1.5068057499045922, + "grad_norm": 1.433488130569458, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8797633647918701, + 
"num_tokens": 452105899.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "grad_norm": 1.4914946556091309, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8721716403961182, + "num_tokens": 452146375.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "grad_norm": 1.4396436214447021, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8799214363098145, + "num_tokens": 452187660.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "grad_norm": 1.4861712455749512, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8605269193649292, + "num_tokens": 452232480.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "grad_norm": 1.4717237949371338, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8780430555343628, + "num_tokens": 452273383.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "grad_norm": 1.484596848487854, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8780192136764526, + "num_tokens": 452311853.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "grad_norm": 1.422895073890686, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8783259987831116, + "num_tokens": 452353400.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "grad_norm": 1.6256396770477295, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8594988584518433, + "num_tokens": 452389454.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "grad_norm": 1.6097275018692017, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8654890656471252, + "num_tokens": 452424268.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "grad_norm": 1.433531403541565, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8811428546905518, + "num_tokens": 452461475.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "grad_norm": 1.5114182233810425, + 
"learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8664440512657166, + "num_tokens": 452500842.0, + "step": 11855 + }, + { + "epoch": 1.508205062969088, + "grad_norm": 1.4404654502868652, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8753576874732971, + "num_tokens": 452542436.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "grad_norm": 1.4446797370910645, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8742238283157349, + "num_tokens": 452582634.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "grad_norm": 1.5922914743423462, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8662480115890503, + "num_tokens": 452618940.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "grad_norm": 1.4671275615692139, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8777154684066772, + "num_tokens": 452657407.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "grad_norm": 1.600989818572998, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8605489730834961, + "num_tokens": 452692472.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "grad_norm": 1.5022063255310059, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8863275051116943, + "num_tokens": 452725829.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "grad_norm": 1.5744552612304688, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8814232349395752, + "num_tokens": 452758315.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "grad_norm": 1.5152246952056885, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8775869607925415, + "num_tokens": 452796047.0, + "step": 11863 + }, + { + "epoch": 1.509222745197812, + "grad_norm": 1.5218583345413208, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8700565099716187, + "num_tokens": 452835516.0, + "step": 
11864 + }, + { + "epoch": 1.5093499554764025, + "grad_norm": 1.5081955194473267, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8907028436660767, + "num_tokens": 452869387.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "grad_norm": 1.3931968212127686, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8851865530014038, + "num_tokens": 452909289.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "grad_norm": 1.4182240962982178, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8921874165534973, + "num_tokens": 452946886.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "grad_norm": 1.499248743057251, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8615715503692627, + "num_tokens": 452986571.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "grad_norm": 1.5452213287353516, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8832802772521973, + "num_tokens": 453022267.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "grad_norm": 1.6995848417282104, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8806024789810181, + "num_tokens": 453051476.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "grad_norm": 1.5132315158843994, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8617451190948486, + "num_tokens": 453090898.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "grad_norm": 1.460748553276062, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8720390796661377, + "num_tokens": 453134785.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "grad_norm": 1.7212432622909546, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8487997651100159, + "num_tokens": 453170311.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "grad_norm": 1.518468976020813, + "learning_rate": 1e-06, + "loss": 0.3496, + 
"mean_token_accuracy": 0.8793820142745972, + "num_tokens": 453209892.0, + "step": 11874 + }, + { + "epoch": 1.5106220582623076, + "grad_norm": 1.5770421028137207, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8782735466957092, + "num_tokens": 453244102.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "grad_norm": 1.5013166666030884, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8833614587783813, + "num_tokens": 453280299.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "grad_norm": 1.4904232025146484, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8839216232299805, + "num_tokens": 453317117.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "grad_norm": 1.4479771852493286, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8805227279663086, + "num_tokens": 453355346.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "grad_norm": 1.4255740642547607, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8902838826179504, + "num_tokens": 453393006.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "grad_norm": 1.5050444602966309, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8666927814483643, + "num_tokens": 453432835.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "grad_norm": 1.7039515972137451, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8600060343742371, + "num_tokens": 453466788.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "grad_norm": 1.4375224113464355, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8931341171264648, + "num_tokens": 453502708.0, + "step": 11882 + }, + { + "epoch": 1.5116397404910318, + "grad_norm": 1.3665311336517334, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8816596865653992, + "num_tokens": 453543985.0, + "step": 11883 + }, + { + "epoch": 
1.5117669507696223, + "grad_norm": 1.383979082107544, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.86809241771698, + "num_tokens": 453587534.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "grad_norm": 1.4292548894882202, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8727867007255554, + "num_tokens": 453630111.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "grad_norm": 1.3757851123809814, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8845458626747131, + "num_tokens": 453671303.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "grad_norm": 1.4949946403503418, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8827991485595703, + "num_tokens": 453707478.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "grad_norm": 1.4437191486358643, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8639153242111206, + "num_tokens": 453749681.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "grad_norm": 1.4327586889266968, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8826022148132324, + "num_tokens": 453788309.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "grad_norm": 1.5257847309112549, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8607856035232544, + "num_tokens": 453827832.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "grad_norm": 1.5868992805480957, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8606097102165222, + "num_tokens": 453871853.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "grad_norm": 1.4887334108352661, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8641213178634644, + "num_tokens": 453914087.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "grad_norm": 1.6797484159469604, + "learning_rate": 1e-06, + "loss": 0.3653, + 
"mean_token_accuracy": 0.8687511682510376, + "num_tokens": 453946080.0, + "step": 11893 + }, + { + "epoch": 1.5130390535555271, + "grad_norm": 1.5379269123077393, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8936465382575989, + "num_tokens": 453979890.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "grad_norm": 1.4331918954849243, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8754473924636841, + "num_tokens": 454025184.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "grad_norm": 1.5319892168045044, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8729619979858398, + "num_tokens": 454066741.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "grad_norm": 1.6086952686309814, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8694450855255127, + "num_tokens": 454101706.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "grad_norm": 1.5765012502670288, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8792191743850708, + "num_tokens": 454140385.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "grad_norm": 1.4909168481826782, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8775904774665833, + "num_tokens": 454177240.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "grad_norm": 1.515565276145935, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8597532510757446, + "num_tokens": 454217075.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "grad_norm": 1.4427142143249512, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8768328428268433, + "num_tokens": 454255683.0, + "step": 11901 + }, + { + "epoch": 1.5140567357842514, + "grad_norm": 1.3481574058532715, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8906869888305664, + "num_tokens": 454296405.0, + "step": 11902 + }, + { + "epoch": 
1.514183946062842, + "grad_norm": 1.5737017393112183, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8750894069671631, + "num_tokens": 454333670.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "grad_norm": 1.5054080486297607, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8720211982727051, + "num_tokens": 454373181.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "grad_norm": 1.4157367944717407, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8786448240280151, + "num_tokens": 454414058.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "grad_norm": 1.4022760391235352, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8743360042572021, + "num_tokens": 454456320.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "grad_norm": 1.699216365814209, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8725520968437195, + "num_tokens": 454493660.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "grad_norm": 1.5309648513793945, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8894321918487549, + "num_tokens": 454528067.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "grad_norm": 1.5392910242080688, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8640505075454712, + "num_tokens": 454569220.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "grad_norm": 1.5038937330245972, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8589099645614624, + "num_tokens": 454612030.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "grad_norm": 1.6669381856918335, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8639768958091736, + "num_tokens": 454648657.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "grad_norm": 1.552044153213501, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 
0.8745003938674927, + "num_tokens": 454686489.0, + "step": 11912 + }, + { + "epoch": 1.515456048848747, + "grad_norm": 1.456168532371521, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8865378499031067, + "num_tokens": 454724187.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "grad_norm": 1.5430669784545898, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8768885135650635, + "num_tokens": 454763241.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "grad_norm": 1.4185278415679932, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8788630962371826, + "num_tokens": 454805227.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "grad_norm": 1.6538825035095215, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8629374504089355, + "num_tokens": 454841646.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "grad_norm": 1.5503389835357666, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8796812295913696, + "num_tokens": 454878285.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "grad_norm": 1.388079047203064, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8894170522689819, + "num_tokens": 454919560.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "grad_norm": 1.5229235887527466, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8828960657119751, + "num_tokens": 454957730.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "grad_norm": 1.4271067380905151, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8739672899246216, + "num_tokens": 455000207.0, + "step": 11920 + }, + { + "epoch": 1.516473731077471, + "grad_norm": 1.4481464624404907, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8791041374206543, + "num_tokens": 455041909.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "grad_norm": 
1.5972410440444946, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8853575587272644, + "num_tokens": 455072113.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "grad_norm": 1.4736855030059814, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8806082606315613, + "num_tokens": 455111640.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "grad_norm": 1.421636939048767, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8826125860214233, + "num_tokens": 455154291.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "grad_norm": 1.6749825477600098, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8737388849258423, + "num_tokens": 455184172.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "grad_norm": 1.464877724647522, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8557133674621582, + "num_tokens": 455230919.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "grad_norm": 1.441665530204773, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8764982223510742, + "num_tokens": 455275812.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "grad_norm": 1.501726508140564, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8757587671279907, + "num_tokens": 455313369.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "grad_norm": 1.508426308631897, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8885689973831177, + "num_tokens": 455347879.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "grad_norm": 1.5345350503921509, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8774106502532959, + "num_tokens": 455385647.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "grad_norm": 1.468945860862732, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8851321935653687, + "num_tokens": 
455420617.0, + "step": 11931 + }, + { + "epoch": 1.5178730441419668, + "grad_norm": 1.4083458185195923, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8843084573745728, + "num_tokens": 455461327.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "grad_norm": 1.5503337383270264, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8953981399536133, + "num_tokens": 455494255.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "grad_norm": 1.4559688568115234, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.87453293800354, + "num_tokens": 455536216.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "grad_norm": 1.4179059267044067, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8779286742210388, + "num_tokens": 455577689.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "grad_norm": 1.4729247093200684, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8778802156448364, + "num_tokens": 455618587.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "grad_norm": 1.5443532466888428, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8517678380012512, + "num_tokens": 455659360.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "grad_norm": 1.7324053049087524, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.864453911781311, + "num_tokens": 455690942.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "grad_norm": 1.3938544988632202, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.885019838809967, + "num_tokens": 455735151.0, + "step": 11939 + }, + { + "epoch": 1.5188907263706908, + "grad_norm": 1.5372610092163086, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8731495141983032, + "num_tokens": 455771980.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "grad_norm": 1.553305745124817, + "learning_rate": 
1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8635791540145874, + "num_tokens": 455810469.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "grad_norm": 1.4160099029541016, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8899046778678894, + "num_tokens": 455846148.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "grad_norm": 1.501894474029541, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8817688226699829, + "num_tokens": 455882202.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "grad_norm": 1.5757577419281006, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8834194540977478, + "num_tokens": 455913351.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "grad_norm": 1.609989047050476, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8611581325531006, + "num_tokens": 455947943.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "grad_norm": 1.506853461265564, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8793283700942993, + "num_tokens": 455985490.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "grad_norm": 1.4634705781936646, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8845674991607666, + "num_tokens": 456024839.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "grad_norm": 1.4591468572616577, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8848550319671631, + "num_tokens": 456065918.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "grad_norm": 1.4221584796905518, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8659515976905823, + "num_tokens": 456110416.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "grad_norm": 1.4557983875274658, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8931407332420349, + "num_tokens": 456146999.0, + "step": 11950 + }, + 
{ + "epoch": 1.5202900394351864, + "grad_norm": 1.527235984802246, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8773003816604614, + "num_tokens": 456182565.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "grad_norm": 1.450036644935608, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8681653141975403, + "num_tokens": 456225390.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "grad_norm": 1.6969245672225952, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.860946774482727, + "num_tokens": 456260289.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "grad_norm": 1.481968641281128, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8681246638298035, + "num_tokens": 456303720.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "grad_norm": 1.3957334756851196, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8874406218528748, + "num_tokens": 456346542.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "grad_norm": 1.4631359577178955, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8833015561103821, + "num_tokens": 456386791.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "grad_norm": 1.4978805780410767, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.860107421875, + "num_tokens": 456427135.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "grad_norm": 1.463862657546997, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8709432482719421, + "num_tokens": 456470175.0, + "step": 11958 + }, + { + "epoch": 1.5213077216639106, + "grad_norm": 1.7389709949493408, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8720656633377075, + "num_tokens": 456502023.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "grad_norm": 1.476140022277832, + "learning_rate": 1e-06, + "loss": 0.3198, + 
"mean_token_accuracy": 0.8851311206817627, + "num_tokens": 456540369.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "grad_norm": 1.519075870513916, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8735182285308838, + "num_tokens": 456578675.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "grad_norm": 1.5645477771759033, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8908017873764038, + "num_tokens": 456611052.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "grad_norm": 1.5736624002456665, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8821794986724854, + "num_tokens": 456649038.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "grad_norm": 1.4575750827789307, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8857790231704712, + "num_tokens": 456687852.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "grad_norm": 1.5709370374679565, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8842893838882446, + "num_tokens": 456722681.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "grad_norm": 1.5023410320281982, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8568053245544434, + "num_tokens": 456767596.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "grad_norm": 1.417783498764038, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8713443279266357, + "num_tokens": 456809352.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "grad_norm": 1.5282500982284546, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8713337182998657, + "num_tokens": 456848678.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "grad_norm": 1.5490407943725586, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8819741010665894, + "num_tokens": 456882168.0, + "step": 11969 + }, + { + "epoch": 1.522707034728406, + 
"grad_norm": 1.6428390741348267, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.866378903388977, + "num_tokens": 456915410.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "grad_norm": 1.4629219770431519, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8783133029937744, + "num_tokens": 456953588.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "grad_norm": 1.411933422088623, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8658415675163269, + "num_tokens": 456997893.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "grad_norm": 1.4711512327194214, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8783653378486633, + "num_tokens": 457036439.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "grad_norm": 1.608779788017273, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8716244101524353, + "num_tokens": 457069794.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "grad_norm": 1.6161712408065796, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.867863655090332, + "num_tokens": 457108851.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "grad_norm": 1.4566380977630615, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8811787366867065, + "num_tokens": 457149014.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "grad_norm": 1.776092767715454, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8530187606811523, + "num_tokens": 457182503.0, + "step": 11977 + }, + { + "epoch": 1.5237247169571302, + "grad_norm": 1.6248630285263062, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8702027797698975, + "num_tokens": 457216627.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "grad_norm": 1.5447745323181152, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8783842325210571, + 
"num_tokens": 457252646.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "grad_norm": 1.4426218271255493, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8793608546257019, + "num_tokens": 457292895.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "grad_norm": 1.5492337942123413, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.846275806427002, + "num_tokens": 457337923.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "grad_norm": 1.4459786415100098, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8884755373001099, + "num_tokens": 457376112.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "grad_norm": 1.4730103015899658, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8661136031150818, + "num_tokens": 457415458.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "grad_norm": 1.613673210144043, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8795903921127319, + "num_tokens": 457451987.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "grad_norm": 1.4685297012329102, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8683382272720337, + "num_tokens": 457496883.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "grad_norm": 1.5442523956298828, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8650531768798828, + "num_tokens": 457534712.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "grad_norm": 1.4760977029800415, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8787763714790344, + "num_tokens": 457576995.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "grad_norm": 1.7406046390533447, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8713143467903137, + "num_tokens": 457608736.0, + "step": 11988 + }, + { + "epoch": 1.5251240300216258, + "grad_norm": 1.504171371459961, + 
"learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8813479542732239, + "num_tokens": 457650515.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "grad_norm": 1.501003384590149, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8899781107902527, + "num_tokens": 457687133.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "grad_norm": 1.522005319595337, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8702383041381836, + "num_tokens": 457724487.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "grad_norm": 1.4411183595657349, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8853652477264404, + "num_tokens": 457761781.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "grad_norm": 1.573872685432434, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8818292021751404, + "num_tokens": 457795312.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "grad_norm": 1.5121989250183105, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8575198650360107, + "num_tokens": 457838466.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "grad_norm": 1.5014897584915161, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8854098320007324, + "num_tokens": 457877515.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "grad_norm": 1.611867904663086, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8743605613708496, + "num_tokens": 457912949.0, + "step": 11996 + }, + { + "epoch": 1.5261417122503498, + "grad_norm": 1.4476237297058105, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8799052834510803, + "num_tokens": 457953409.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "grad_norm": 1.7287852764129639, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8627991676330566, + "num_tokens": 457990866.0, + 
"step": 11998 + }, + { + "epoch": 1.5263961328075308, + "grad_norm": 1.4230239391326904, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8750059604644775, + "num_tokens": 458029887.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "grad_norm": 1.4966118335723877, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8802635669708252, + "num_tokens": 458070985.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "grad_norm": 1.5894441604614258, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8691327571868896, + "num_tokens": 458109267.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "grad_norm": 1.5572453737258911, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8653175234794617, + "num_tokens": 458148809.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "grad_norm": 1.5685694217681885, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8736544847488403, + "num_tokens": 458184899.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "grad_norm": 1.4471757411956787, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8763248324394226, + "num_tokens": 458223965.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "grad_norm": 1.6832811832427979, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8622901439666748, + "num_tokens": 458258099.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "grad_norm": 1.3423068523406982, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8876593708992004, + "num_tokens": 458299649.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "grad_norm": 1.414434552192688, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8800796270370483, + "num_tokens": 458340402.0, + "step": 12007 + }, + { + "epoch": 1.5275410253148456, + "grad_norm": 1.6189926862716675, + "learning_rate": 1e-06, + 
"loss": 0.3303, + "mean_token_accuracy": 0.8816331028938293, + "num_tokens": 458377580.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "grad_norm": 1.490214467048645, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8822854161262512, + "num_tokens": 458417321.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "grad_norm": 1.6156437397003174, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8787804841995239, + "num_tokens": 458453834.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "grad_norm": 1.610276460647583, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8761098384857178, + "num_tokens": 458487506.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "grad_norm": 1.447468638420105, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.86252760887146, + "num_tokens": 458531871.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "grad_norm": 1.625238060951233, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8692346811294556, + "num_tokens": 458567756.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "grad_norm": 1.4469836950302124, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8784290552139282, + "num_tokens": 458609019.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "grad_norm": 1.6484326124191284, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.881700873374939, + "num_tokens": 458646360.0, + "step": 12015 + }, + { + "epoch": 1.5285587075435694, + "grad_norm": 1.5842760801315308, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8731730580329895, + "num_tokens": 458682043.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "grad_norm": 1.5240947008132935, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8782620429992676, + "num_tokens": 458721960.0, + "step": 12017 + }, + { + "epoch": 
1.5288131281007504, + "grad_norm": 1.4127708673477173, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8850927948951721, + "num_tokens": 458759070.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "grad_norm": 1.4490209817886353, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8897418975830078, + "num_tokens": 458799371.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "grad_norm": 1.6208250522613525, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8805803060531616, + "num_tokens": 458832557.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "grad_norm": 1.515893578529358, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8812586069107056, + "num_tokens": 458871660.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "grad_norm": 1.653012990951538, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8577855825424194, + "num_tokens": 458906546.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "grad_norm": 1.7691200971603394, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8695343732833862, + "num_tokens": 458939776.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "grad_norm": 1.5613337755203247, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8857033848762512, + "num_tokens": 458974654.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "grad_norm": 1.4966169595718384, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8755875825881958, + "num_tokens": 459014454.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "grad_norm": 1.8253767490386963, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8725509643554688, + "num_tokens": 459056500.0, + "step": 12026 + }, + { + "epoch": 1.5299580206080652, + "grad_norm": 1.4767554998397827, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 
0.8838354349136353, + "num_tokens": 459097090.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "grad_norm": 1.4558475017547607, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.865108072757721, + "num_tokens": 459137721.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "grad_norm": 1.4326728582382202, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8848433494567871, + "num_tokens": 459175325.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "grad_norm": 1.5267351865768433, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8591073751449585, + "num_tokens": 459215656.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "grad_norm": 1.4780910015106201, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8926862478256226, + "num_tokens": 459253995.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "grad_norm": 1.54677152633667, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8765579462051392, + "num_tokens": 459288033.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "grad_norm": 1.6201320886611938, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8821138143539429, + "num_tokens": 459322695.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "grad_norm": 1.6998238563537598, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.878332793712616, + "num_tokens": 459354273.0, + "step": 12034 + }, + { + "epoch": 1.5309757028367892, + "grad_norm": 1.388413667678833, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.886859118938446, + "num_tokens": 459394264.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "grad_norm": 1.4957507848739624, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.870619535446167, + "num_tokens": 459435012.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "grad_norm": 
1.5257740020751953, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8864461183547974, + "num_tokens": 459468341.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "grad_norm": 1.7059935331344604, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8720803260803223, + "num_tokens": 459501843.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "grad_norm": 1.476975440979004, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8842654228210449, + "num_tokens": 459537381.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "grad_norm": 1.4687738418579102, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8666528463363647, + "num_tokens": 459580300.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "grad_norm": 1.4851484298706055, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8703427314758301, + "num_tokens": 459617928.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "grad_norm": 1.5397332906723022, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8613671660423279, + "num_tokens": 459657579.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "grad_norm": 1.6018472909927368, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8724570274353027, + "num_tokens": 459695631.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "grad_norm": 1.5836126804351807, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8777933120727539, + "num_tokens": 459731355.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "grad_norm": 1.6139740943908691, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8871905207633972, + "num_tokens": 459762811.0, + "step": 12045 + }, + { + "epoch": 1.5323750159012848, + "grad_norm": 1.556000828742981, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8730877637863159, + 
"num_tokens": 459802516.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "grad_norm": 1.4815585613250732, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8863885998725891, + "num_tokens": 459837957.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "grad_norm": 1.5050708055496216, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8714629411697388, + "num_tokens": 459878360.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "grad_norm": 1.4876571893692017, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.879177987575531, + "num_tokens": 459920653.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "grad_norm": 1.4182045459747314, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8796059489250183, + "num_tokens": 459961852.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "grad_norm": 1.5654945373535156, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8732227087020874, + "num_tokens": 459995591.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "grad_norm": 1.438903570175171, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.885883092880249, + "num_tokens": 460036339.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "grad_norm": 1.619174599647522, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.88751220703125, + "num_tokens": 460066143.0, + "step": 12053 + }, + { + "epoch": 1.533392698130009, + "grad_norm": 1.6159800291061401, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8783319592475891, + "num_tokens": 460101392.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "grad_norm": 1.6600559949874878, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8635056614875793, + "num_tokens": 460137657.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "grad_norm": 1.4658490419387817, + 
"learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8784707188606262, + "num_tokens": 460181943.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "grad_norm": 1.5496575832366943, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8649096488952637, + "num_tokens": 460223154.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "grad_norm": 1.6149338483810425, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8788074851036072, + "num_tokens": 460260330.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "grad_norm": 1.5090484619140625, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.857255220413208, + "num_tokens": 460302422.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "grad_norm": 1.404792308807373, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8920173048973083, + "num_tokens": 460345069.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "grad_norm": 1.6068974733352661, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8654216527938843, + "num_tokens": 460382030.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "grad_norm": 1.5647279024124146, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8673375248908997, + "num_tokens": 460419134.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "grad_norm": 1.4719266891479492, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8938136100769043, + "num_tokens": 460455909.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "grad_norm": 1.5175855159759521, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.890874981880188, + "num_tokens": 460489108.0, + "step": 12064 + }, + { + "epoch": 1.5347920111945044, + "grad_norm": 1.6042561531066895, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8653832674026489, + "num_tokens": 460525343.0, + "step": 
12065 + }, + { + "epoch": 1.534919221473095, + "grad_norm": 1.4243172407150269, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.879020631313324, + "num_tokens": 460569926.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "grad_norm": 1.5337004661560059, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8683401942253113, + "num_tokens": 460610099.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "grad_norm": 1.6570757627487183, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8767517805099487, + "num_tokens": 460644140.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "grad_norm": 1.5099544525146484, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8864940404891968, + "num_tokens": 460681888.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "grad_norm": 1.45450758934021, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8936360478401184, + "num_tokens": 460721811.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "grad_norm": 1.639851689338684, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8756421804428101, + "num_tokens": 460756732.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "grad_norm": 1.6080598831176758, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8849743008613586, + "num_tokens": 460790267.0, + "step": 12072 + }, + { + "epoch": 1.5358096934232286, + "grad_norm": 1.536232829093933, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.888921856880188, + "num_tokens": 460822564.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "grad_norm": 1.6339311599731445, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.872258722782135, + "num_tokens": 460859802.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "grad_norm": 1.495369553565979, + "learning_rate": 1e-06, + "loss": 0.3398, + 
"mean_token_accuracy": 0.88213050365448, + "num_tokens": 460899430.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "grad_norm": 1.5324043035507202, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8858119249343872, + "num_tokens": 460935015.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "grad_norm": 1.4591622352600098, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8676600456237793, + "num_tokens": 460974914.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "grad_norm": 1.5994583368301392, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8748740553855896, + "num_tokens": 461010112.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "grad_norm": 1.5142064094543457, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8580036759376526, + "num_tokens": 461051014.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "grad_norm": 1.5642869472503662, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8713996410369873, + "num_tokens": 461087440.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "grad_norm": 1.5572153329849243, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8745341300964355, + "num_tokens": 461122762.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "grad_norm": 1.4390159845352173, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8746253252029419, + "num_tokens": 461164550.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "grad_norm": 1.6060329675674438, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.869615912437439, + "num_tokens": 461201231.0, + "step": 12083 + }, + { + "epoch": 1.5372090064877242, + "grad_norm": 1.5617618560791016, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.889393150806427, + "num_tokens": 461235115.0, + "step": 12084 + }, + { + "epoch": 
1.5373362167663147, + "grad_norm": 1.7282713651657104, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8661563992500305, + "num_tokens": 461268687.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "grad_norm": 1.4964162111282349, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8834277391433716, + "num_tokens": 461307925.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "grad_norm": 1.5748262405395508, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8694539070129395, + "num_tokens": 461347200.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "grad_norm": 1.4941376447677612, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8767794370651245, + "num_tokens": 461389560.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "grad_norm": 1.5843982696533203, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8660829663276672, + "num_tokens": 461424924.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "grad_norm": 1.5237131118774414, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8752415776252747, + "num_tokens": 461463675.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "grad_norm": 1.5921239852905273, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8832207322120667, + "num_tokens": 461500707.0, + "step": 12091 + }, + { + "epoch": 1.5382266887164482, + "grad_norm": 1.5122714042663574, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8653549551963806, + "num_tokens": 461540624.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "grad_norm": 1.4932355880737305, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8778326511383057, + "num_tokens": 461580347.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "grad_norm": 1.6368439197540283, + "learning_rate": 1e-06, + "loss": 0.3875, + 
"mean_token_accuracy": 0.8645703792572021, + "num_tokens": 461611384.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "grad_norm": 1.4942928552627563, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.872695803642273, + "num_tokens": 461648340.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "grad_norm": 1.3270044326782227, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.88300621509552, + "num_tokens": 461694839.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "grad_norm": 1.5504155158996582, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8769657611846924, + "num_tokens": 461730247.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "grad_norm": 1.6230406761169434, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8658853769302368, + "num_tokens": 461764329.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "grad_norm": 1.4632188081741333, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.886106014251709, + "num_tokens": 461800734.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "grad_norm": 1.504285454750061, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8903243541717529, + "num_tokens": 461835367.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "grad_norm": 1.4970260858535767, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8831879496574402, + "num_tokens": 461872091.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "grad_norm": 1.6031506061553955, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8779546022415161, + "num_tokens": 461908977.0, + "step": 12102 + }, + { + "epoch": 1.539626001780944, + "grad_norm": 1.4239946603775024, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8854899406433105, + "num_tokens": 461948472.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, 
+ "grad_norm": 1.476438045501709, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8684933185577393, + "num_tokens": 461994196.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "grad_norm": 1.4973998069763184, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8753336668014526, + "num_tokens": 462036208.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "grad_norm": 1.4596410989761353, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8947988748550415, + "num_tokens": 462073176.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "grad_norm": 1.8315175771713257, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8679825663566589, + "num_tokens": 462104030.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "grad_norm": 1.7392388582229614, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8708078861236572, + "num_tokens": 462137385.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "grad_norm": 1.6664494276046753, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8800202012062073, + "num_tokens": 462170433.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "grad_norm": 1.4681943655014038, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8770850300788879, + "num_tokens": 462212516.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "grad_norm": 1.5491046905517578, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8806595206260681, + "num_tokens": 462248184.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "grad_norm": 1.3886713981628418, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8803597688674927, + "num_tokens": 462290877.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "grad_norm": 1.441906452178955, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8834676742553711, + 
"num_tokens": 462331037.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "grad_norm": 1.5920573472976685, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8730638027191162, + "num_tokens": 462365858.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "grad_norm": 1.9371479749679565, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8692426681518555, + "num_tokens": 462395822.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "grad_norm": 1.56193208694458, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8803203105926514, + "num_tokens": 462431868.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "grad_norm": 1.6160537004470825, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8728787899017334, + "num_tokens": 462468418.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "grad_norm": 1.5766569375991821, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8761475086212158, + "num_tokens": 462503657.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "grad_norm": 1.6601417064666748, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8794823884963989, + "num_tokens": 462535274.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "grad_norm": 1.4652835130691528, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8823798894882202, + "num_tokens": 462572223.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "grad_norm": 1.412359595298767, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8761926889419556, + "num_tokens": 462616096.0, + "step": 12121 + }, + { + "epoch": 1.5420429970741636, + "grad_norm": 1.6079310178756714, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8728886246681213, + "num_tokens": 462650325.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "grad_norm": 1.5335367918014526, + 
"learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8765835762023926, + "num_tokens": 462691515.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "grad_norm": 1.428877830505371, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8782337307929993, + "num_tokens": 462731127.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "grad_norm": 1.4749172925949097, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8852043747901917, + "num_tokens": 462768525.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "grad_norm": 1.3856769800186157, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8828979730606079, + "num_tokens": 462806616.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "grad_norm": 1.4408940076828003, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8774589896202087, + "num_tokens": 462845698.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "grad_norm": 1.5224601030349731, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8848825693130493, + "num_tokens": 462880818.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "grad_norm": 1.4798619747161865, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.865207850933075, + "num_tokens": 462918696.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "grad_norm": 1.5998685359954834, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8657840490341187, + "num_tokens": 462956670.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "grad_norm": 1.6022930145263672, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8814288973808289, + "num_tokens": 462989159.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "grad_norm": 1.7974295616149902, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8879314064979553, + "num_tokens": 463019775.0, + 
"step": 12132 + }, + { + "epoch": 1.5434423101386592, + "grad_norm": 1.4997315406799316, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.884880781173706, + "num_tokens": 463059264.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "grad_norm": 1.5068368911743164, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8763284087181091, + "num_tokens": 463096300.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "grad_norm": 1.6050626039505005, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8776293396949768, + "num_tokens": 463127241.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "grad_norm": 1.4291316270828247, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8712477087974548, + "num_tokens": 463168077.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "grad_norm": 1.4659173488616943, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8717210292816162, + "num_tokens": 463210548.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "grad_norm": 1.3849486112594604, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8757981061935425, + "num_tokens": 463253325.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "grad_norm": 1.4360257387161255, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8781169652938843, + "num_tokens": 463294799.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "grad_norm": 1.5379060506820679, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8729080557823181, + "num_tokens": 463333525.0, + "step": 12140 + }, + { + "epoch": 1.5444599923673832, + "grad_norm": 1.5293906927108765, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.865673303604126, + "num_tokens": 463371459.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "grad_norm": 1.48844313621521, + "learning_rate": 1e-06, + "loss": 
0.3916, + "mean_token_accuracy": 0.8618833422660828, + "num_tokens": 463413189.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "grad_norm": 1.6486554145812988, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8561506867408752, + "num_tokens": 463447674.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "grad_norm": 1.4366353750228882, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8678520917892456, + "num_tokens": 463492360.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "grad_norm": 1.5223145484924316, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.881062388420105, + "num_tokens": 463526155.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "grad_norm": 1.5902507305145264, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8809287548065186, + "num_tokens": 463560427.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "grad_norm": 1.5633447170257568, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8860617876052856, + "num_tokens": 463592400.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "grad_norm": 1.4260215759277344, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8911436796188354, + "num_tokens": 463630972.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "grad_norm": 1.6070384979248047, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8727942705154419, + "num_tokens": 463666943.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "grad_norm": 1.475569725036621, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8852340579032898, + "num_tokens": 463707988.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "grad_norm": 1.5918725728988647, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8684883713722229, + "num_tokens": 463745876.0, + "step": 12151 + }, + { + "epoch": 
1.545859305431879, + "grad_norm": 1.6113133430480957, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8777296543121338, + "num_tokens": 463780511.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "grad_norm": 1.567484736442566, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8738162517547607, + "num_tokens": 463819189.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "grad_norm": 1.493251085281372, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8917292952537537, + "num_tokens": 463853148.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "grad_norm": 1.6211886405944824, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8648275136947632, + "num_tokens": 463886976.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "grad_norm": 1.5502920150756836, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8663396835327148, + "num_tokens": 463926287.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "grad_norm": 1.6021777391433716, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8793164491653442, + "num_tokens": 463962269.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "grad_norm": 1.6081068515777588, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8728201985359192, + "num_tokens": 463998322.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "grad_norm": 1.4977961778640747, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8717781901359558, + "num_tokens": 464035752.0, + "step": 12159 + }, + { + "epoch": 1.546876987660603, + "grad_norm": 1.5163731575012207, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.873309850692749, + "num_tokens": 464071803.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "grad_norm": 1.4287965297698975, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 
0.8776845335960388, + "num_tokens": 464114516.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "grad_norm": 1.5315760374069214, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8797838091850281, + "num_tokens": 464150342.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "grad_norm": 1.565880537033081, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8798140287399292, + "num_tokens": 464187357.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "grad_norm": 1.4444812536239624, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8831572532653809, + "num_tokens": 464224187.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "grad_norm": 1.600699543952942, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8756520748138428, + "num_tokens": 464256655.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "grad_norm": 1.5635652542114258, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8784357905387878, + "num_tokens": 464292466.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "grad_norm": 1.4942692518234253, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8741810321807861, + "num_tokens": 464331154.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "grad_norm": 1.507460117340088, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8671538829803467, + "num_tokens": 464371238.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "grad_norm": 1.5843161344528198, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.871536374092102, + "num_tokens": 464407359.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "grad_norm": 1.524557113647461, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8840539455413818, + "num_tokens": 464445374.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "grad_norm": 
1.3173408508300781, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8935223817825317, + "num_tokens": 464490291.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "grad_norm": 1.4950439929962158, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8742603063583374, + "num_tokens": 464529206.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "grad_norm": 1.5746785402297974, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8668912053108215, + "num_tokens": 464567662.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "grad_norm": 1.447300910949707, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8814669847488403, + "num_tokens": 464607620.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "grad_norm": 1.7314289808273315, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8702148199081421, + "num_tokens": 464646093.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "grad_norm": 1.677341341972351, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8595471382141113, + "num_tokens": 464681440.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "grad_norm": 1.5637530088424683, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8758987188339233, + "num_tokens": 464719943.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "grad_norm": 1.508042812347412, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8862769603729248, + "num_tokens": 464755060.0, + "step": 12178 + }, + { + "epoch": 1.5492939829538228, + "grad_norm": 1.392366886138916, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8779062628746033, + "num_tokens": 464801000.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "grad_norm": 1.3645787239074707, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8797001242637634, + 
"num_tokens": 464842949.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "grad_norm": 1.4259912967681885, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8693345785140991, + "num_tokens": 464885548.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "grad_norm": 1.594939947128296, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8755632638931274, + "num_tokens": 464922021.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "grad_norm": 1.5038063526153564, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8689053058624268, + "num_tokens": 464962087.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "grad_norm": 1.5023905038833618, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8739304542541504, + "num_tokens": 465000182.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "grad_norm": 1.4415041208267212, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8767876625061035, + "num_tokens": 465042404.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "grad_norm": 1.576087236404419, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8445843458175659, + "num_tokens": 465082862.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "grad_norm": 1.5037189722061157, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8688417673110962, + "num_tokens": 465124468.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "grad_norm": 1.543222427368164, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8795663118362427, + "num_tokens": 465157276.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "grad_norm": 1.5248534679412842, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8564989566802979, + "num_tokens": 465199352.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "grad_norm": 1.6426444053649902, + 
"learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8840334415435791, + "num_tokens": 465229561.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "grad_norm": 1.6622705459594727, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8755168914794922, + "num_tokens": 465260553.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "grad_norm": 1.4997553825378418, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8771587610244751, + "num_tokens": 465299635.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "grad_norm": 1.526286005973816, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8768344521522522, + "num_tokens": 465338248.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "grad_norm": 1.3854703903198242, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8847191333770752, + "num_tokens": 465380874.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "grad_norm": 1.5666319131851196, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8767917156219482, + "num_tokens": 465418484.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "grad_norm": 1.5715148448944092, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.873348593711853, + "num_tokens": 465455680.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "grad_norm": 1.3817662000656128, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8911439180374146, + "num_tokens": 465496367.0, + "step": 12197 + }, + { + "epoch": 1.5517109782470424, + "grad_norm": 1.6305516958236694, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8745702505111694, + "num_tokens": 465528654.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "grad_norm": 1.6423481702804565, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.879814088344574, + "num_tokens": 465559263.0, + 
"step": 12199 + }, + { + "epoch": 1.5519653988042235, + "grad_norm": 1.4464763402938843, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8834700584411621, + "num_tokens": 465598188.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "grad_norm": 1.506230354309082, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8755828142166138, + "num_tokens": 465636365.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "grad_norm": 1.5583510398864746, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.871198296546936, + "num_tokens": 465679697.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "grad_norm": 1.506636619567871, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8774722218513489, + "num_tokens": 465717770.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "grad_norm": 1.393799901008606, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8861204385757446, + "num_tokens": 465759776.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "grad_norm": 1.4870339632034302, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8752020597457886, + "num_tokens": 465800998.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "grad_norm": 1.5826915502548218, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8640281558036804, + "num_tokens": 465837457.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "grad_norm": 1.3567839860916138, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8730071783065796, + "num_tokens": 465882186.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "grad_norm": 1.4031052589416504, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8918365836143494, + "num_tokens": 465922754.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "grad_norm": 1.5218983888626099, + "learning_rate": 1e-06, + "loss": 
0.3553, + "mean_token_accuracy": 0.8777368068695068, + "num_tokens": 465959682.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "grad_norm": 1.5521674156188965, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8819363117218018, + "num_tokens": 465994751.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "grad_norm": 1.570035696029663, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8808077573776245, + "num_tokens": 466032659.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "grad_norm": 1.413978934288025, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8903998136520386, + "num_tokens": 466069842.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "grad_norm": 1.3999890089035034, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8893173336982727, + "num_tokens": 466110823.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "grad_norm": 1.5034806728363037, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8751335144042969, + "num_tokens": 466146271.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "grad_norm": 1.4891475439071655, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8728125095367432, + "num_tokens": 466188999.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "grad_norm": 1.5599011182785034, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8745369911193848, + "num_tokens": 466224314.0, + "step": 12216 + }, + { + "epoch": 1.554127973540262, + "grad_norm": 1.5480347871780396, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8665039539337158, + "num_tokens": 466260591.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "grad_norm": 1.4980061054229736, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8741298913955688, + "num_tokens": 466298878.0, + "step": 12218 + }, + { + "epoch": 
1.554382394097443, + "grad_norm": 1.4709538221359253, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8939430117607117, + "num_tokens": 466335156.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "grad_norm": 1.720725655555725, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8536368608474731, + "num_tokens": 466368972.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "grad_norm": 1.5019012689590454, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8738495111465454, + "num_tokens": 466406106.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "grad_norm": 1.4707825183868408, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8660818338394165, + "num_tokens": 466449026.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "grad_norm": 1.4830482006072998, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8757516145706177, + "num_tokens": 466485903.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "grad_norm": 1.4232903718948364, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8660746216773987, + "num_tokens": 466527735.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "grad_norm": 1.4520255327224731, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8899636268615723, + "num_tokens": 466565719.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "grad_norm": 1.4432313442230225, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8712001442909241, + "num_tokens": 466605864.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "grad_norm": 1.3637202978134155, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8869529962539673, + "num_tokens": 466647603.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "grad_norm": 1.6018362045288086, + "learning_rate": 1e-06, + "loss": 0.3384, + 
"mean_token_accuracy": 0.8777254223823547, + "num_tokens": 466678722.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "grad_norm": 1.4518818855285645, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8755732774734497, + "num_tokens": 466719458.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "grad_norm": 1.4983409643173218, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8855273723602295, + "num_tokens": 466759496.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "grad_norm": 1.683186650276184, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8648507595062256, + "num_tokens": 466796849.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "grad_norm": 1.5117110013961792, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.877373993396759, + "num_tokens": 466834405.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "grad_norm": 1.6015610694885254, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.872693657875061, + "num_tokens": 466869140.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "grad_norm": 1.6299959421157837, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8824124336242676, + "num_tokens": 466902029.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "grad_norm": 1.4395085573196411, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8919650316238403, + "num_tokens": 466939378.0, + "step": 12235 + }, + { + "epoch": 1.5565449688334816, + "grad_norm": 1.3791700601577759, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8876785039901733, + "num_tokens": 466981859.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "grad_norm": 1.5929253101348877, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8651127815246582, + "num_tokens": 467020992.0, + "step": 12237 + }, + { + "epoch": 
1.5567993893906626, + "grad_norm": 1.4448069334030151, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8874792456626892, + "num_tokens": 467055376.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "grad_norm": 1.7410609722137451, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8586830496788025, + "num_tokens": 467091439.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "grad_norm": 1.6843836307525635, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8733407855033875, + "num_tokens": 467124725.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "grad_norm": 1.4962416887283325, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8748293519020081, + "num_tokens": 467160792.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "grad_norm": 1.5079147815704346, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8734482526779175, + "num_tokens": 467202922.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "grad_norm": 1.6138756275177002, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.861829400062561, + "num_tokens": 467244939.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "grad_norm": 1.6471794843673706, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8648639917373657, + "num_tokens": 467278936.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "grad_norm": 1.5486140251159668, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8777986168861389, + "num_tokens": 467314584.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "grad_norm": 1.5831401348114014, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8733145594596863, + "num_tokens": 467350595.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "grad_norm": 1.745505452156067, + "learning_rate": 1e-06, + "loss": 0.3393, + 
"mean_token_accuracy": 0.8816113471984863, + "num_tokens": 467383430.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "grad_norm": 1.5409258604049683, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8677207231521606, + "num_tokens": 467420554.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "grad_norm": 1.4493532180786133, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.888108491897583, + "num_tokens": 467460680.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "grad_norm": 1.487686276435852, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.872441291809082, + "num_tokens": 467500798.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "grad_norm": 1.8433059453964233, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8607060313224792, + "num_tokens": 467531571.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "grad_norm": 1.5320414304733276, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8785959482192993, + "num_tokens": 467569485.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "grad_norm": 1.5952436923980713, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8627465963363647, + "num_tokens": 467606901.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "grad_norm": 1.4555763006210327, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8676362633705139, + "num_tokens": 467648240.0, + "step": 12254 + }, + { + "epoch": 1.5589619641267014, + "grad_norm": 1.6732497215270996, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8741931915283203, + "num_tokens": 467683011.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "grad_norm": 1.5613435506820679, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8736558556556702, + "num_tokens": 467719574.0, + "step": 12256 + }, + { + "epoch": 1.5592163846838825, 
+ "grad_norm": 1.4510949850082397, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8781570196151733, + "num_tokens": 467758966.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "grad_norm": 1.453481674194336, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8761129379272461, + "num_tokens": 467795656.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "grad_norm": 1.5350431203842163, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8743687868118286, + "num_tokens": 467834345.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "grad_norm": 1.4827110767364502, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8793938755989075, + "num_tokens": 467870732.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "grad_norm": 1.5363270044326782, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8798840641975403, + "num_tokens": 467909760.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "grad_norm": 1.444960117340088, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8872429132461548, + "num_tokens": 467949392.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "grad_norm": 1.4189069271087646, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8909624814987183, + "num_tokens": 467984229.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "grad_norm": 1.463818073272705, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8608901500701904, + "num_tokens": 468025287.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "grad_norm": 1.4845387935638428, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8737168312072754, + "num_tokens": 468063875.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "grad_norm": 1.5126852989196777, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8739161491394043, 
+ "num_tokens": 468103145.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "grad_norm": 1.6606773138046265, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8867273926734924, + "num_tokens": 468137265.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "grad_norm": 1.6197130680084229, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8835041522979736, + "num_tokens": 468169031.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "grad_norm": 1.6124752759933472, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8698916435241699, + "num_tokens": 468203289.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "grad_norm": 1.4845906496047974, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.866085410118103, + "num_tokens": 468245509.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "grad_norm": 1.4877495765686035, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8801337480545044, + "num_tokens": 468279428.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "grad_norm": 1.4271594285964966, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8854088187217712, + "num_tokens": 468317189.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "grad_norm": 1.3536406755447388, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8912702202796936, + "num_tokens": 468358555.0, + "step": 12273 + }, + { + "epoch": 1.5613789594199212, + "grad_norm": 1.467518925666809, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8722622394561768, + "num_tokens": 468400216.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "grad_norm": 1.5265895128250122, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8839053511619568, + "num_tokens": 468434342.0, + "step": 12275 + }, + { + "epoch": 1.5616333799771023, + "grad_norm": 1.655921459197998, + 
"learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8686041831970215, + "num_tokens": 468467344.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "grad_norm": 1.5762757062911987, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8765978813171387, + "num_tokens": 468501884.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "grad_norm": 1.4190336465835571, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8934962749481201, + "num_tokens": 468541952.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "grad_norm": 1.6444562673568726, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8894336223602295, + "num_tokens": 468573789.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "grad_norm": 1.4498533010482788, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.894254207611084, + "num_tokens": 468607696.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "grad_norm": 1.7304683923721313, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.880660891532898, + "num_tokens": 468642835.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "grad_norm": 1.7295479774475098, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8753759860992432, + "num_tokens": 468674444.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "grad_norm": 1.4251837730407715, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8872438669204712, + "num_tokens": 468716729.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "grad_norm": 1.5587466955184937, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8751633167266846, + "num_tokens": 468753561.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "grad_norm": 1.4495769739151, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8729769587516785, + "num_tokens": 468795328.0, + 
"step": 12285 + }, + { + "epoch": 1.5629054827630071, + "grad_norm": 1.4603230953216553, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8798454999923706, + "num_tokens": 468835766.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "grad_norm": 1.635477900505066, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8695942759513855, + "num_tokens": 468867905.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "grad_norm": 1.5336573123931885, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.86763596534729, + "num_tokens": 468906695.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "grad_norm": 1.6050688028335571, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8672776818275452, + "num_tokens": 468940754.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "grad_norm": 1.6001698970794678, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.873759925365448, + "num_tokens": 468979383.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "grad_norm": 1.5232391357421875, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8758213520050049, + "num_tokens": 469017020.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "grad_norm": 1.4492254257202148, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8801538944244385, + "num_tokens": 469060036.0, + "step": 12292 + }, + { + "epoch": 1.5637959547131408, + "grad_norm": 1.3203901052474976, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8933467864990234, + "num_tokens": 469101064.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "grad_norm": 1.4767789840698242, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8854755759239197, + "num_tokens": 469144476.0, + "step": 12294 + }, + { + "epoch": 1.5640503752703219, + "grad_norm": 1.4510332345962524, + "learning_rate": 1e-06, + 
"loss": 0.3145, + "mean_token_accuracy": 0.8879657983779907, + "num_tokens": 469183541.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "grad_norm": 1.4592911005020142, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8777417540550232, + "num_tokens": 469221456.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "grad_norm": 1.3854745626449585, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8765286207199097, + "num_tokens": 469267000.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "grad_norm": 1.5370169878005981, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8785879611968994, + "num_tokens": 469306360.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "grad_norm": 1.6600333452224731, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8548822402954102, + "num_tokens": 469344074.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "grad_norm": 1.4433401823043823, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8926454782485962, + "num_tokens": 469384468.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "grad_norm": 1.6777105331420898, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8544407486915588, + "num_tokens": 469423823.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "grad_norm": 1.5127991437911987, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8751074075698853, + "num_tokens": 469465815.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "grad_norm": 1.4703936576843262, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8892333507537842, + "num_tokens": 469500284.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "grad_norm": 1.4553052186965942, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8879134654998779, + "num_tokens": 469540881.0, + "step": 12304 + }, + { + 
"epoch": 1.565322478056227, + "grad_norm": 1.6295337677001953, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8660573959350586, + "num_tokens": 469579170.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "grad_norm": 1.5601608753204346, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.889933705329895, + "num_tokens": 469610849.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "grad_norm": 1.5755406618118286, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8722249269485474, + "num_tokens": 469647420.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "grad_norm": 1.4129761457443237, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8712882399559021, + "num_tokens": 469692412.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "grad_norm": 1.3815832138061523, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8899271488189697, + "num_tokens": 469733513.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "grad_norm": 1.5125761032104492, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8746922016143799, + "num_tokens": 469769687.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "grad_norm": 1.5667959451675415, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.872370719909668, + "num_tokens": 469806047.0, + "step": 12311 + }, + { + "epoch": 1.5662129500063604, + "grad_norm": 1.4287091493606567, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8819238543510437, + "num_tokens": 469845518.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "grad_norm": 1.541313886642456, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8822224140167236, + "num_tokens": 469881123.0, + "step": 12313 + }, + { + "epoch": 1.5664673705635415, + "grad_norm": 1.458986520767212, + "learning_rate": 1e-06, + "loss": 0.3659, + 
"mean_token_accuracy": 0.8731619119644165, + "num_tokens": 469920182.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "grad_norm": 1.4919332265853882, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8845847249031067, + "num_tokens": 469955739.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "grad_norm": 1.407599925994873, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8723236918449402, + "num_tokens": 469997839.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "grad_norm": 1.5999168157577515, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.867546558380127, + "num_tokens": 470034237.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "grad_norm": 1.4763447046279907, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8745777010917664, + "num_tokens": 470073855.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "grad_norm": 1.5583839416503906, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8786300420761108, + "num_tokens": 470108995.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "grad_norm": 1.5594987869262695, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8814605474472046, + "num_tokens": 470141944.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "grad_norm": 1.3696714639663696, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8832811117172241, + "num_tokens": 470182710.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "grad_norm": 1.6204521656036377, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8669945001602173, + "num_tokens": 470220732.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "grad_norm": 1.6412591934204102, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.876522958278656, + "num_tokens": 470253965.0, + "step": 12323 + }, + { + "epoch": 
1.5677394733494467, + "grad_norm": 1.5411252975463867, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8692349195480347, + "num_tokens": 470295955.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "grad_norm": 1.5737553834915161, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8897767066955566, + "num_tokens": 470336981.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "grad_norm": 1.6220721006393433, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8662947416305542, + "num_tokens": 470375570.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "grad_norm": 1.5241270065307617, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8658528923988342, + "num_tokens": 470413476.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "grad_norm": 1.5902163982391357, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8716682195663452, + "num_tokens": 470447404.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "grad_norm": 1.447296380996704, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8826426267623901, + "num_tokens": 470489469.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "grad_norm": 1.4061042070388794, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8862083554267883, + "num_tokens": 470531519.0, + "step": 12330 + }, + { + "epoch": 1.5686299452995802, + "grad_norm": 1.4959356784820557, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8753836154937744, + "num_tokens": 470571897.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "grad_norm": 1.6307462453842163, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8781900405883789, + "num_tokens": 470602687.0, + "step": 12332 + }, + { + "epoch": 1.5688843658567613, + "grad_norm": 1.448778748512268, + "learning_rate": 1e-06, + "loss": 0.3634, + 
"mean_token_accuracy": 0.8723487854003906, + "num_tokens": 470642744.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "grad_norm": 1.4812535047531128, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8645368814468384, + "num_tokens": 470688411.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "grad_norm": 1.3733235597610474, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8828127980232239, + "num_tokens": 470729131.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "grad_norm": 1.5832421779632568, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8649849891662598, + "num_tokens": 470766942.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "grad_norm": 1.3976905345916748, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8910806179046631, + "num_tokens": 470807590.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "grad_norm": 1.5489227771759033, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8760390281677246, + "num_tokens": 470842468.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "grad_norm": 1.581990122795105, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8751976490020752, + "num_tokens": 470881675.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "grad_norm": 1.5320734977722168, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8500348329544067, + "num_tokens": 470922594.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "grad_norm": 1.5250802040100098, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8708560466766357, + "num_tokens": 470960197.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "grad_norm": 1.510796308517456, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.857235312461853, + "num_tokens": 470997948.0, + "step": 12342 + }, + { + "epoch": 
1.5701564686426663, + "grad_norm": 1.5924838781356812, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8614328503608704, + "num_tokens": 471032499.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "grad_norm": 1.3875430822372437, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8972446322441101, + "num_tokens": 471070603.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "grad_norm": 1.604304313659668, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8630311489105225, + "num_tokens": 471108651.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "grad_norm": 1.5144381523132324, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8849369287490845, + "num_tokens": 471145165.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "grad_norm": 1.6194498538970947, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8833959102630615, + "num_tokens": 471175289.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "grad_norm": 1.405025601387024, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8770794868469238, + "num_tokens": 471217080.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "grad_norm": 1.4960602521896362, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8790110945701599, + "num_tokens": 471255892.0, + "step": 12349 + }, + { + "epoch": 1.5710469405928, + "grad_norm": 1.6005737781524658, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8770619630813599, + "num_tokens": 471293010.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "grad_norm": 1.4688960313796997, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8694049119949341, + "num_tokens": 471332798.0, + "step": 12351 + }, + { + "epoch": 1.5713013611499809, + "grad_norm": 1.5914788246154785, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 
0.883259117603302, + "num_tokens": 471364323.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 1.6310217380523682, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8762855529785156, + "num_tokens": 471399905.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "grad_norm": 1.508630633354187, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8739169239997864, + "num_tokens": 471434612.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "grad_norm": 1.6486517190933228, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8691165447235107, + "num_tokens": 471467691.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "grad_norm": 1.5127356052398682, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8812975883483887, + "num_tokens": 471505674.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "grad_norm": 1.5963160991668701, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8682434558868408, + "num_tokens": 471541478.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "grad_norm": 1.4468035697937012, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8776260614395142, + "num_tokens": 471583420.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "grad_norm": 1.474504828453064, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8959357142448425, + "num_tokens": 471618010.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "grad_norm": 1.458481788635254, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.876360297203064, + "num_tokens": 471658210.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "grad_norm": 1.4117474555969238, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8850278854370117, + "num_tokens": 471698036.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "grad_norm": 
1.4632149934768677, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8867264986038208, + "num_tokens": 471735138.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "grad_norm": 1.552606225013733, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8772897124290466, + "num_tokens": 471771887.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "grad_norm": 1.5604455471038818, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8697060942649841, + "num_tokens": 471811896.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "grad_norm": 1.676795482635498, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8632620573043823, + "num_tokens": 471845273.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "grad_norm": 1.431325078010559, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.870886504650116, + "num_tokens": 471888076.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "grad_norm": 1.5561530590057373, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8662316799163818, + "num_tokens": 471924254.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "grad_norm": 1.5398072004318237, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8804963231086731, + "num_tokens": 471959935.0, + "step": 12368 + }, + { + "epoch": 1.5734639358860196, + "grad_norm": 1.5562504529953003, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8863963484764099, + "num_tokens": 471995982.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "grad_norm": 1.6877145767211914, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8760887980461121, + "num_tokens": 472026890.0, + "step": 12370 + }, + { + "epoch": 1.5737183564432007, + "grad_norm": 1.59735906124115, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8710905313491821, + "num_tokens": 
472061877.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "grad_norm": 1.4015012979507446, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8879595398902893, + "num_tokens": 472102010.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "grad_norm": 1.5071682929992676, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8842557668685913, + "num_tokens": 472138044.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "grad_norm": 1.5679813623428345, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8654201626777649, + "num_tokens": 472176005.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "grad_norm": 1.3716044425964355, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8774383664131165, + "num_tokens": 472223953.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "grad_norm": 1.504828929901123, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8712388277053833, + "num_tokens": 472266915.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "grad_norm": 1.6159418821334839, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8785328269004822, + "num_tokens": 472300530.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "grad_norm": 1.4943219423294067, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.885631799697876, + "num_tokens": 472337952.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "grad_norm": 1.4230704307556152, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8844569325447083, + "num_tokens": 472377219.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "grad_norm": 1.4014713764190674, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8848997354507446, + "num_tokens": 472416919.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "grad_norm": 1.4600356817245483, + 
"learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8810464143753052, + "num_tokens": 472458209.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "grad_norm": 1.707879900932312, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8723064661026001, + "num_tokens": 472494010.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "grad_norm": 1.4127253293991089, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8897818326950073, + "num_tokens": 472537647.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "grad_norm": 1.5315037965774536, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8637882471084595, + "num_tokens": 472578228.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "grad_norm": 1.4438016414642334, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8944718837738037, + "num_tokens": 472612215.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "grad_norm": 1.4974784851074219, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8732774257659912, + "num_tokens": 472651786.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "grad_norm": 1.4897258281707764, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8776006698608398, + "num_tokens": 472691148.0, + "step": 12387 + }, + { + "epoch": 1.5758809311792392, + "grad_norm": 1.4299136400222778, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8903183341026306, + "num_tokens": 472726631.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "grad_norm": 1.4036937952041626, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8886028528213501, + "num_tokens": 472766653.0, + "step": 12389 + }, + { + "epoch": 1.5761353517364203, + "grad_norm": 1.5552905797958374, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8830642104148865, + "num_tokens": 472801732.0, + 
"step": 12390 + }, + { + "epoch": 1.5762625620150108, + "grad_norm": 1.4793058633804321, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8677241206169128, + "num_tokens": 472841785.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "grad_norm": 1.6050773859024048, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8776546716690063, + "num_tokens": 472876444.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "grad_norm": 1.5169869661331177, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8804938197135925, + "num_tokens": 472912839.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "grad_norm": 1.4886016845703125, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.890066385269165, + "num_tokens": 472952371.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "grad_norm": 1.477786660194397, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8850447535514832, + "num_tokens": 472990184.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "grad_norm": 1.5952153205871582, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8708024024963379, + "num_tokens": 473024083.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "grad_norm": 1.4117592573165894, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8829267024993896, + "num_tokens": 473061843.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "grad_norm": 1.4094250202178955, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.9021737575531006, + "num_tokens": 473099073.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "grad_norm": 1.6160622835159302, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8787746429443359, + "num_tokens": 473133116.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "grad_norm": 1.605187177658081, + "learning_rate": 1e-06, + "loss": 
0.3504, + "mean_token_accuracy": 0.8781484365463257, + "num_tokens": 473172890.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "grad_norm": 1.54531729221344, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8799130320549011, + "num_tokens": 473207988.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "grad_norm": 1.5055794715881348, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8757957816123962, + "num_tokens": 473249393.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "grad_norm": 1.5308053493499756, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8738528490066528, + "num_tokens": 473287984.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "grad_norm": 1.5376185178756714, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8778456449508667, + "num_tokens": 473327296.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "grad_norm": 1.6680787801742554, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8729400634765625, + "num_tokens": 473358694.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "grad_norm": 1.4256677627563477, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8906539678573608, + "num_tokens": 473401109.0, + "step": 12406 + }, + { + "epoch": 1.5782979264724588, + "grad_norm": 1.5829753875732422, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8737280368804932, + "num_tokens": 473438307.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "grad_norm": 1.5103543996810913, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8737968802452087, + "num_tokens": 473475516.0, + "step": 12408 + }, + { + "epoch": 1.5785523470296399, + "grad_norm": 1.481068730354309, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.881808876991272, + "num_tokens": 473515062.0, + "step": 12409 + }, + { + "epoch": 
1.5786795573082304, + "grad_norm": 1.5153824090957642, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8929007649421692, + "num_tokens": 473552754.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "grad_norm": 1.5700854063034058, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8898571729660034, + "num_tokens": 473585096.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "grad_norm": 1.5901652574539185, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8661234974861145, + "num_tokens": 473622638.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "grad_norm": 1.3045082092285156, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8926866054534912, + "num_tokens": 473663933.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "grad_norm": 1.5180106163024902, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8719280362129211, + "num_tokens": 473703077.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "grad_norm": 1.5044788122177124, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8772659301757812, + "num_tokens": 473744003.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "grad_norm": 1.5067764520645142, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8778702020645142, + "num_tokens": 473782374.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "grad_norm": 1.4835563898086548, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8743105530738831, + "num_tokens": 473818032.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "grad_norm": 1.528704047203064, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8629061579704285, + "num_tokens": 473859535.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "grad_norm": 1.4607254266738892, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 
0.8858044147491455, + "num_tokens": 473896594.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "grad_norm": 1.490078330039978, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8650761842727661, + "num_tokens": 473938594.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "grad_norm": 1.4317295551300049, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8810352683067322, + "num_tokens": 473976924.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "grad_norm": 1.4484672546386719, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8830319046974182, + "num_tokens": 474017146.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "grad_norm": 1.4671396017074585, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8856230974197388, + "num_tokens": 474056138.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "grad_norm": 1.585426926612854, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8763453960418701, + "num_tokens": 474090312.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "grad_norm": 1.4620730876922607, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8718891143798828, + "num_tokens": 474130410.0, + "step": 12425 + }, + { + "epoch": 1.5807149217656786, + "grad_norm": 1.3992109298706055, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8707926869392395, + "num_tokens": 474172760.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "grad_norm": 1.5876034498214722, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8724535703659058, + "num_tokens": 474209036.0, + "step": 12427 + }, + { + "epoch": 1.5809693423228597, + "grad_norm": 1.5185211896896362, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.867170512676239, + "num_tokens": 474248220.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "grad_norm": 
1.3281742334365845, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8919771909713745, + "num_tokens": 474290711.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "grad_norm": 1.5901050567626953, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8734973669052124, + "num_tokens": 474327614.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "grad_norm": 1.4234776496887207, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8687005043029785, + "num_tokens": 474373401.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "grad_norm": 1.4785276651382446, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8804751634597778, + "num_tokens": 474410914.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "grad_norm": 1.487068772315979, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.889437198638916, + "num_tokens": 474450356.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "grad_norm": 1.6230207681655884, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.860816240310669, + "num_tokens": 474490476.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "grad_norm": 1.4832614660263062, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8867998719215393, + "num_tokens": 474535074.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "grad_norm": 1.4801881313323975, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8849225044250488, + "num_tokens": 474574564.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "grad_norm": 1.4954875707626343, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.874864935874939, + "num_tokens": 474611256.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "grad_norm": 1.5150798559188843, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8697982430458069, + "num_tokens": 
474651878.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "grad_norm": 1.6538029909133911, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8722225427627563, + "num_tokens": 474689832.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "grad_norm": 1.3432118892669678, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8986802101135254, + "num_tokens": 474731220.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "grad_norm": 1.796106219291687, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8536580801010132, + "num_tokens": 474767434.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "grad_norm": 1.380110740661621, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8967880010604858, + "num_tokens": 474804770.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "grad_norm": 1.5487604141235352, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.871253252029419, + "num_tokens": 474840228.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "grad_norm": 1.5745810270309448, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.86807781457901, + "num_tokens": 474874545.0, + "step": 12444 + }, + { + "epoch": 1.5831319170588984, + "grad_norm": 1.4778194427490234, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8865916728973389, + "num_tokens": 474913113.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "grad_norm": 1.5557266473770142, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8712595701217651, + "num_tokens": 474953624.0, + "step": 12446 + }, + { + "epoch": 1.5833863376160795, + "grad_norm": 1.6205899715423584, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.876413106918335, + "num_tokens": 474989408.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "grad_norm": 1.6153331995010376, + "learning_rate": 1e-06, 
+ "loss": 0.3411, + "mean_token_accuracy": 0.8789913654327393, + "num_tokens": 475021734.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "grad_norm": 1.465751051902771, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8679537773132324, + "num_tokens": 475060430.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "grad_norm": 1.562422513961792, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8742038011550903, + "num_tokens": 475095446.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "grad_norm": 1.5216978788375854, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8821190595626831, + "num_tokens": 475128994.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "grad_norm": 1.4624135494232178, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8853334188461304, + "num_tokens": 475168232.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "grad_norm": 1.4266620874404907, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8849384784698486, + "num_tokens": 475205262.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "grad_norm": 1.487514615058899, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.865959644317627, + "num_tokens": 475244630.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "grad_norm": 1.438684105873108, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8890684843063354, + "num_tokens": 475279589.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "grad_norm": 1.4493566751480103, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8619647026062012, + "num_tokens": 475325177.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "grad_norm": 1.4921547174453735, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8853520750999451, + "num_tokens": 475362052.0, + "step": 12457 + }, + { + "epoch": 
1.5847856506805749, + "grad_norm": 1.5665478706359863, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.866127073764801, + "num_tokens": 475399421.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "grad_norm": 1.4859110116958618, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8849799633026123, + "num_tokens": 475436396.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "grad_norm": 1.4232666492462158, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8877663016319275, + "num_tokens": 475471988.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "grad_norm": 1.5021796226501465, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8751391768455505, + "num_tokens": 475508186.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "grad_norm": 1.4743210077285767, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8865855932235718, + "num_tokens": 475547361.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "grad_norm": 1.6309922933578491, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8683776259422302, + "num_tokens": 475582873.0, + "step": 12463 + }, + { + "epoch": 1.585548912352118, + "grad_norm": 1.476807951927185, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8850917220115662, + "num_tokens": 475621043.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "grad_norm": 1.5835403203964233, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8750596642494202, + "num_tokens": 475658352.0, + "step": 12465 + }, + { + "epoch": 1.585803332909299, + "grad_norm": 1.4033541679382324, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.878854513168335, + "num_tokens": 475698632.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "grad_norm": 1.438816785812378, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 
0.8724687099456787, + "num_tokens": 475737928.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "grad_norm": 1.4391276836395264, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8659384250640869, + "num_tokens": 475782086.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "grad_norm": 1.7423468828201294, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8510063886642456, + "num_tokens": 475814292.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "grad_norm": 1.388218879699707, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8834638595581055, + "num_tokens": 475860694.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "grad_norm": 1.6356301307678223, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8711910843849182, + "num_tokens": 475894476.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "grad_norm": 1.5036479234695435, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8836898803710938, + "num_tokens": 475929305.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "grad_norm": 1.4968202114105225, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8675230741500854, + "num_tokens": 475970128.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "grad_norm": 1.3913706541061401, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8782457113265991, + "num_tokens": 476010653.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "grad_norm": 1.334620714187622, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8723747134208679, + "num_tokens": 476055492.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "grad_norm": 1.502081036567688, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8784112930297852, + "num_tokens": 476093060.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "grad_norm": 
1.6301971673965454, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8754861950874329, + "num_tokens": 476127507.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "grad_norm": 1.3692125082015991, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8891736268997192, + "num_tokens": 476172227.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "grad_norm": 1.3984742164611816, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8780764937400818, + "num_tokens": 476216504.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "grad_norm": 1.5885852575302124, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8924064040184021, + "num_tokens": 476247612.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "grad_norm": 1.6935360431671143, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.874350368976593, + "num_tokens": 476281304.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "grad_norm": 1.5306881666183472, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8700814247131348, + "num_tokens": 476321680.0, + "step": 12482 + }, + { + "epoch": 1.5879659076453376, + "grad_norm": 1.4885176420211792, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8768441677093506, + "num_tokens": 476359315.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "grad_norm": 1.4476559162139893, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8855085372924805, + "num_tokens": 476399091.0, + "step": 12484 + }, + { + "epoch": 1.5882203282025187, + "grad_norm": 1.436522364616394, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.869140088558197, + "num_tokens": 476441830.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "grad_norm": 1.5070935487747192, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8626775741577148, + 
"num_tokens": 476482722.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "grad_norm": 1.4938335418701172, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.865894079208374, + "num_tokens": 476523865.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "grad_norm": 1.44891357421875, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.9013497829437256, + "num_tokens": 476559658.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "grad_norm": 1.5808378458023071, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8802924156188965, + "num_tokens": 476599102.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "grad_norm": 1.5313827991485596, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8602861762046814, + "num_tokens": 476642849.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "grad_norm": 1.530497670173645, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8690692782402039, + "num_tokens": 476678024.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "grad_norm": 1.2819184064865112, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8898365497589111, + "num_tokens": 476722718.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "grad_norm": 1.4796854257583618, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8834087252616882, + "num_tokens": 476759510.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "grad_norm": 1.4844772815704346, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8699890375137329, + "num_tokens": 476797962.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "grad_norm": 1.5185136795043945, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8796199560165405, + "num_tokens": 476836398.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "grad_norm": 1.416117787361145, + 
"learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8815295100212097, + "num_tokens": 476878349.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "grad_norm": 1.4871478080749512, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8672593832015991, + "num_tokens": 476919397.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "grad_norm": 1.5195889472961426, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8808643817901611, + "num_tokens": 476954943.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "grad_norm": 1.416621446609497, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8829456567764282, + "num_tokens": 476995033.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "grad_norm": 1.4768249988555908, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8818361759185791, + "num_tokens": 477033150.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "grad_norm": 1.802897334098816, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8613659143447876, + "num_tokens": 477072358.0, + "step": 12501 + }, + { + "epoch": 1.5903829029385574, + "grad_norm": 1.4852232933044434, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8944075107574463, + "num_tokens": 477108223.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "grad_norm": 1.3731714487075806, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8816016316413879, + "num_tokens": 477151271.0, + "step": 12503 + }, + { + "epoch": 1.5906373234957385, + "grad_norm": 1.5714750289916992, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8576303720474243, + "num_tokens": 477190951.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "grad_norm": 1.3847750425338745, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8841745257377625, + "num_tokens": 477234527.0, + "step": 
12505 + }, + { + "epoch": 1.5908917440529193, + "grad_norm": 1.638166904449463, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8651021718978882, + "num_tokens": 477269523.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "grad_norm": 1.489085078239441, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8760262727737427, + "num_tokens": 477309582.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "grad_norm": 1.4706535339355469, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8821702003479004, + "num_tokens": 477347120.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "grad_norm": 1.4648561477661133, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8807316422462463, + "num_tokens": 477386531.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "grad_norm": 1.4790010452270508, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8724358677864075, + "num_tokens": 477426112.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "grad_norm": 1.5265134572982788, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8865879774093628, + "num_tokens": 477463089.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "grad_norm": 1.3479650020599365, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8875352144241333, + "num_tokens": 477508520.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "grad_norm": 1.402866005897522, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8816075921058655, + "num_tokens": 477548064.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "grad_norm": 1.6360487937927246, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8527103066444397, + "num_tokens": 477583404.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "grad_norm": 1.6143667697906494, + "learning_rate": 1e-06, + "loss": 0.3719, 
+ "mean_token_accuracy": 0.8701291084289551, + "num_tokens": 477615809.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "grad_norm": 1.523677110671997, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.887649416923523, + "num_tokens": 477654101.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "grad_norm": 1.3794151544570923, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8837395906448364, + "num_tokens": 477695569.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "grad_norm": 1.4718396663665771, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8786889314651489, + "num_tokens": 477733566.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "grad_norm": 1.4685616493225098, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8708363175392151, + "num_tokens": 477776486.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "grad_norm": 1.5566625595092773, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8615499138832092, + "num_tokens": 477814885.0, + "step": 12520 + }, + { + "epoch": 1.5927998982317773, + "grad_norm": 1.556996464729309, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8646140098571777, + "num_tokens": 477851959.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "grad_norm": 1.5070210695266724, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8764461874961853, + "num_tokens": 477889446.0, + "step": 12522 + }, + { + "epoch": 1.5930543187889583, + "grad_norm": 1.59871244430542, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8780428767204285, + "num_tokens": 477924791.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "grad_norm": 1.3922169208526611, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.876380205154419, + "num_tokens": 477967821.0, + "step": 12524 + }, + { + "epoch": 
1.5933087393461391, + "grad_norm": 1.5898792743682861, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8762744665145874, + "num_tokens": 478005664.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "grad_norm": 1.3634182214736938, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8949083089828491, + "num_tokens": 478046352.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "grad_norm": 1.3792933225631714, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8744968771934509, + "num_tokens": 478091447.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "grad_norm": 1.4064115285873413, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8812090158462524, + "num_tokens": 478134165.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "grad_norm": 1.6756377220153809, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8834406733512878, + "num_tokens": 478173308.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "grad_norm": 1.489999771118164, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8796374797821045, + "num_tokens": 478212949.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "grad_norm": 1.5002249479293823, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8878674507141113, + "num_tokens": 478249144.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "grad_norm": 1.5236045122146606, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8712396025657654, + "num_tokens": 478288825.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "grad_norm": 1.5279430150985718, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8689237833023071, + "num_tokens": 478325668.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "grad_norm": 1.6819663047790527, + "learning_rate": 1e-06, + "loss": 0.3979, + 
"mean_token_accuracy": 0.8620436191558838, + "num_tokens": 478360294.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "grad_norm": 1.4333312511444092, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8840098977088928, + "num_tokens": 478402373.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "grad_norm": 1.5686626434326172, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8759607076644897, + "num_tokens": 478439079.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "grad_norm": 1.5079116821289062, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8828799724578857, + "num_tokens": 478476696.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "grad_norm": 1.6508848667144775, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8638628125190735, + "num_tokens": 478507944.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "grad_norm": 1.469513177871704, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8667072057723999, + "num_tokens": 478548059.0, + "step": 12539 + }, + { + "epoch": 1.5952168935249968, + "grad_norm": 1.5209397077560425, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8723238706588745, + "num_tokens": 478592017.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "grad_norm": 1.5235034227371216, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.881798267364502, + "num_tokens": 478627629.0, + "step": 12541 + }, + { + "epoch": 1.595471314082178, + "grad_norm": 1.3877979516983032, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8926516771316528, + "num_tokens": 478668008.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "grad_norm": 1.4987536668777466, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8732982873916626, + "num_tokens": 478706550.0, + "step": 12543 + }, + { + "epoch": 
1.595725734639359, + "grad_norm": 1.5209096670150757, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8683732748031616, + "num_tokens": 478743030.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "grad_norm": 1.781436800956726, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8751070499420166, + "num_tokens": 478772779.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "grad_norm": 1.438109278678894, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8937228322029114, + "num_tokens": 478807663.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "grad_norm": 1.4006726741790771, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8829381465911865, + "num_tokens": 478851759.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "grad_norm": 1.852541446685791, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8353760838508606, + "num_tokens": 478883292.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "grad_norm": 1.5253431797027588, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8710893392562866, + "num_tokens": 478923766.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "grad_norm": 1.4565142393112183, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8726420402526855, + "num_tokens": 478964582.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "grad_norm": 1.4200528860092163, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8838384747505188, + "num_tokens": 479003843.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "grad_norm": 1.5880674123764038, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8813194036483765, + "num_tokens": 479036895.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "grad_norm": 1.4557541608810425, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 
0.8680439591407776, + "num_tokens": 479077042.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "grad_norm": 1.3727216720581055, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8876425623893738, + "num_tokens": 479117646.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "grad_norm": 1.546587586402893, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8688995838165283, + "num_tokens": 479157627.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "grad_norm": 1.5775821208953857, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8732731342315674, + "num_tokens": 479190840.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "grad_norm": 1.5066250562667847, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8818808794021606, + "num_tokens": 479225258.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "grad_norm": 1.6673675775527954, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8747013807296753, + "num_tokens": 479259207.0, + "step": 12558 + }, + { + "epoch": 1.5976338888182164, + "grad_norm": 1.417090654373169, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8695838451385498, + "num_tokens": 479302930.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "grad_norm": 1.4780057668685913, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8837146759033203, + "num_tokens": 479339628.0, + "step": 12560 + }, + { + "epoch": 1.5978883093753975, + "grad_norm": 1.4717718362808228, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8822958469390869, + "num_tokens": 479377832.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "grad_norm": 1.5309475660324097, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8796910047531128, + "num_tokens": 479415408.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "grad_norm": 
1.470660924911499, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8767949938774109, + "num_tokens": 479452726.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "grad_norm": 1.4620212316513062, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8824706673622131, + "num_tokens": 479490072.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "grad_norm": 1.444699764251709, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8761554956436157, + "num_tokens": 479530058.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "grad_norm": 1.6540602445602417, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8661173582077026, + "num_tokens": 479565903.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "grad_norm": 1.503803014755249, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8747807145118713, + "num_tokens": 479607227.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "grad_norm": 1.4671251773834229, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8832441568374634, + "num_tokens": 479646775.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "grad_norm": 1.5346885919570923, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8705259561538696, + "num_tokens": 479683873.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "grad_norm": 1.4318853616714478, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8742623925209045, + "num_tokens": 479722780.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "grad_norm": 1.413002371788025, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8779854774475098, + "num_tokens": 479764801.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "grad_norm": 1.5115792751312256, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8889325857162476, + "num_tokens": 
479803245.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "grad_norm": 1.641583800315857, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.872402548789978, + "num_tokens": 479837776.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "grad_norm": 1.4599809646606445, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8651250004768372, + "num_tokens": 479879743.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "grad_norm": 1.4723454713821411, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8606568574905396, + "num_tokens": 479924779.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "grad_norm": 1.44582998752594, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8647404909133911, + "num_tokens": 479966156.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "grad_norm": 1.5342236757278442, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8815268278121948, + "num_tokens": 479999707.0, + "step": 12577 + }, + { + "epoch": 1.6000508841114363, + "grad_norm": 1.4188607931137085, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8811163902282715, + "num_tokens": 480041505.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "grad_norm": 1.5283113718032837, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8805446624755859, + "num_tokens": 480075231.0, + "step": 12579 + }, + { + "epoch": 1.600305304668617, + "grad_norm": 1.7689483165740967, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8652464151382446, + "num_tokens": 480106616.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "grad_norm": 1.4576919078826904, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8771181106567383, + "num_tokens": 480146563.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "grad_norm": 1.3842198848724365, + "learning_rate": 
1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8883446455001831, + "num_tokens": 480186277.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "grad_norm": 1.464264154434204, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8767483830451965, + "num_tokens": 480229013.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "grad_norm": 1.4424322843551636, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8754004240036011, + "num_tokens": 480269506.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "grad_norm": 1.5391192436218262, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8720185160636902, + "num_tokens": 480302754.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "grad_norm": 1.423967719078064, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8758940696716309, + "num_tokens": 480343177.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "grad_norm": 1.4546380043029785, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8875827789306641, + "num_tokens": 480379675.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "grad_norm": 1.5636659860610962, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8725482225418091, + "num_tokens": 480416934.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "grad_norm": 1.7076427936553955, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8729103803634644, + "num_tokens": 480454170.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "grad_norm": 1.509482741355896, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8762848973274231, + "num_tokens": 480495116.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "grad_norm": 1.4761155843734741, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8810014724731445, + "num_tokens": 480536285.0, + "step": 12591 + }, + 
{ + "epoch": 1.6018318280117034, + "grad_norm": 1.4433774948120117, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8730279803276062, + "num_tokens": 480576161.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "grad_norm": 1.4867010116577148, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8771373629570007, + "num_tokens": 480612897.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "grad_norm": 1.4965749979019165, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.86626136302948, + "num_tokens": 480652930.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "grad_norm": 1.5213261842727661, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8844276070594788, + "num_tokens": 480692439.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "grad_norm": 1.5956573486328125, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8771764039993286, + "num_tokens": 480728889.0, + "step": 12596 + }, + { + "epoch": 1.6024678794046558, + "grad_norm": 1.5382097959518433, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8804788589477539, + "num_tokens": 480764900.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "grad_norm": 1.650212287902832, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8707493543624878, + "num_tokens": 480799264.0, + "step": 12598 + }, + { + "epoch": 1.602722299961837, + "grad_norm": 1.3923717737197876, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8962078094482422, + "num_tokens": 480839191.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "grad_norm": 1.5581377744674683, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8787481188774109, + "num_tokens": 480876195.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "grad_norm": 1.5789252519607544, + "learning_rate": 1e-06, + "loss": 0.4127, + 
"mean_token_accuracy": 0.8570632338523865, + "num_tokens": 480914019.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "grad_norm": 1.550123691558838, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8775124549865723, + "num_tokens": 480949985.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "grad_norm": 1.3698173761367798, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8878388404846191, + "num_tokens": 480991077.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "grad_norm": 1.4019907712936401, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8656063079833984, + "num_tokens": 481036696.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "grad_norm": 1.601886510848999, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8751537799835205, + "num_tokens": 481071639.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "grad_norm": 1.5085996389389038, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8587827682495117, + "num_tokens": 481110636.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "grad_norm": 1.6184626817703247, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8595739006996155, + "num_tokens": 481150923.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "grad_norm": 1.5187264680862427, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8709028959274292, + "num_tokens": 481191791.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "grad_norm": 1.4711579084396362, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8741313219070435, + "num_tokens": 481236230.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "grad_norm": 1.7032917737960815, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8707680702209473, + "num_tokens": 481270004.0, + "step": 12610 + }, + { + "epoch": 
1.604248823304923, + "grad_norm": 1.5362331867218018, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8816688060760498, + "num_tokens": 481306483.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "grad_norm": 1.5323659181594849, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8817261457443237, + "num_tokens": 481341308.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "grad_norm": 1.5867304801940918, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8881908655166626, + "num_tokens": 481373083.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "grad_norm": 1.4973907470703125, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8769980072975159, + "num_tokens": 481410502.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "grad_norm": 1.4873653650283813, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8768388628959656, + "num_tokens": 481446817.0, + "step": 12615 + }, + { + "epoch": 1.6048848746978757, + "grad_norm": 1.4570376873016357, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8844356536865234, + "num_tokens": 481486405.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "grad_norm": 1.6359456777572632, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.87120521068573, + "num_tokens": 481524194.0, + "step": 12617 + }, + { + "epoch": 1.6051392952550567, + "grad_norm": 1.4773069620132446, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8667611479759216, + "num_tokens": 481567824.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "grad_norm": 1.4835931062698364, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8739152550697327, + "num_tokens": 481607748.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "grad_norm": 1.4207329750061035, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 
0.8733021020889282, + "num_tokens": 481649939.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "grad_norm": 1.495302438735962, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8815521001815796, + "num_tokens": 481687701.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "grad_norm": 1.5378825664520264, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8801854848861694, + "num_tokens": 481722701.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "grad_norm": 1.5359569787979126, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8830561637878418, + "num_tokens": 481759899.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "grad_norm": 1.5385562181472778, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8664035201072693, + "num_tokens": 481796713.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "grad_norm": 1.3873958587646484, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.87581467628479, + "num_tokens": 481839766.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "grad_norm": 1.5800127983093262, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8901717662811279, + "num_tokens": 481874276.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "grad_norm": 1.3964366912841797, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8648148775100708, + "num_tokens": 481917479.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "grad_norm": 1.4673676490783691, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8783062696456909, + "num_tokens": 481959133.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "grad_norm": 1.4718989133834839, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8844513893127441, + "num_tokens": 481997085.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "grad_norm": 
1.4792600870132446, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8761996626853943, + "num_tokens": 482033383.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "grad_norm": 1.5483367443084717, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8828526735305786, + "num_tokens": 482066926.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "grad_norm": 1.34787118434906, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8806256055831909, + "num_tokens": 482110538.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "grad_norm": 1.3501709699630737, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8955144882202148, + "num_tokens": 482149064.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "grad_norm": 1.5942877531051636, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8717589974403381, + "num_tokens": 482186931.0, + "step": 12634 + }, + { + "epoch": 1.6073018699910953, + "grad_norm": 1.7008230686187744, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8707798719406128, + "num_tokens": 482224491.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "grad_norm": 1.5150359869003296, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8703697323799133, + "num_tokens": 482267014.0, + "step": 12636 + }, + { + "epoch": 1.6075562905482763, + "grad_norm": 1.5185562372207642, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8830454349517822, + "num_tokens": 482305659.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "grad_norm": 1.5517784357070923, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8881456255912781, + "num_tokens": 482344186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "grad_norm": 1.6604840755462646, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8680541515350342, + 
"num_tokens": 482377582.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "grad_norm": 1.595924735069275, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8782731890678406, + "num_tokens": 482413495.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "grad_norm": 1.4979419708251953, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8706647753715515, + "num_tokens": 482457090.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "grad_norm": 1.306382179260254, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8883688449859619, + "num_tokens": 482503608.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "grad_norm": 1.4302735328674316, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8818122148513794, + "num_tokens": 482543769.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "grad_norm": 1.5481969118118286, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8775924444198608, + "num_tokens": 482580925.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "grad_norm": 1.5360904932022095, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8676405549049377, + "num_tokens": 482619627.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "grad_norm": 1.5602754354476929, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8759552240371704, + "num_tokens": 482655572.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "grad_norm": 1.601638913154602, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8765240907669067, + "num_tokens": 482689568.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "grad_norm": 1.456466555595398, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8783788681030273, + "num_tokens": 482727719.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "grad_norm": 1.5545884370803833, + 
"learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.883043646812439, + "num_tokens": 482763861.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "grad_norm": 1.5782607793807983, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8764374852180481, + "num_tokens": 482796929.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "grad_norm": 1.402152419090271, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8824799060821533, + "num_tokens": 482834898.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "grad_norm": 1.5087201595306396, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8723819255828857, + "num_tokens": 482873605.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "grad_norm": 1.5454286336898804, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8674217462539673, + "num_tokens": 482910112.0, + "step": 12653 + }, + { + "epoch": 1.6097188652843148, + "grad_norm": 1.4026180505752563, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8849285244941711, + "num_tokens": 482949682.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "grad_norm": 1.4550517797470093, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8874794244766235, + "num_tokens": 482985081.0, + "step": 12655 + }, + { + "epoch": 1.609973285841496, + "grad_norm": 1.5030713081359863, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8969480991363525, + "num_tokens": 483019480.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "grad_norm": 1.5255379676818848, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8806426525115967, + "num_tokens": 483055970.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "grad_norm": 1.5757205486297607, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8880459070205688, + "num_tokens": 483089870.0, + 
"step": 12658 + }, + { + "epoch": 1.6103549166772675, + "grad_norm": 1.5018154382705688, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8835697174072266, + "num_tokens": 483125164.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "grad_norm": 1.4363406896591187, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.882479190826416, + "num_tokens": 483164069.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "grad_norm": 1.4114187955856323, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8882924318313599, + "num_tokens": 483205335.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "grad_norm": 1.5242308378219604, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8662629127502441, + "num_tokens": 483244185.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "grad_norm": 1.4700480699539185, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8817856311798096, + "num_tokens": 483282054.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "grad_norm": 1.5725599527359009, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8611070513725281, + "num_tokens": 483320993.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "grad_norm": 1.575437307357788, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8703840374946594, + "num_tokens": 483356724.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "grad_norm": 1.4210036993026733, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8674792647361755, + "num_tokens": 483401262.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "grad_norm": 1.3514769077301025, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8748934268951416, + "num_tokens": 483445263.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "grad_norm": 1.401930809020996, + "learning_rate": 1e-06, + 
"loss": 0.3194, + "mean_token_accuracy": 0.8882029056549072, + "num_tokens": 483483159.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "grad_norm": 1.3994108438491821, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8731883764266968, + "num_tokens": 483526932.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "grad_norm": 1.5514869689941406, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8766461610794067, + "num_tokens": 483563658.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "grad_norm": 1.5453729629516602, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8770668506622314, + "num_tokens": 483599856.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "grad_norm": 1.539137601852417, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.883236289024353, + "num_tokens": 483634884.0, + "step": 12672 + }, + { + "epoch": 1.6121358605775347, + "grad_norm": 1.5169163942337036, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8757301568984985, + "num_tokens": 483677616.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "grad_norm": 1.5214650630950928, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8805488348007202, + "num_tokens": 483711369.0, + "step": 12674 + }, + { + "epoch": 1.6123902811347157, + "grad_norm": 1.563801884651184, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8679550886154175, + "num_tokens": 483748947.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "grad_norm": 1.3474873304367065, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8780192136764526, + "num_tokens": 483793391.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "grad_norm": 1.4409788846969604, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8831048011779785, + "num_tokens": 483834322.0, + "step": 12677 + }, + { + 
"epoch": 1.612771911970487, + "grad_norm": 1.5545916557312012, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8874740600585938, + "num_tokens": 483866912.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "grad_norm": 1.5200196504592896, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8775278329849243, + "num_tokens": 483904782.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "grad_norm": 1.5009756088256836, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.876680850982666, + "num_tokens": 483945033.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "grad_norm": 1.6502139568328857, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8805761337280273, + "num_tokens": 483981251.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "grad_norm": 1.4442590475082397, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8709834218025208, + "num_tokens": 484021501.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "grad_norm": 1.4560927152633667, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8714002370834351, + "num_tokens": 484062272.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "grad_norm": 1.4377374649047852, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8966035842895508, + "num_tokens": 484100440.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "grad_norm": 1.5535928010940552, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8602439761161804, + "num_tokens": 484137163.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "grad_norm": 1.4253184795379639, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8924731612205505, + "num_tokens": 484173243.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "grad_norm": 1.4369534254074097, + "learning_rate": 1e-06, + "loss": 0.3487, + 
"mean_token_accuracy": 0.8775395750999451, + "num_tokens": 484217244.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "grad_norm": 1.4965952634811401, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8714471459388733, + "num_tokens": 484255435.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "grad_norm": 1.6676276922225952, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8783941864967346, + "num_tokens": 484289582.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "grad_norm": 1.4997291564941406, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8878780007362366, + "num_tokens": 484324360.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "grad_norm": 1.3763545751571655, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8843761682510376, + "num_tokens": 484369302.0, + "step": 12691 + }, + { + "epoch": 1.6145528558707545, + "grad_norm": 1.5627456903457642, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8800979852676392, + "num_tokens": 484406591.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "grad_norm": 1.3414335250854492, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8811800479888916, + "num_tokens": 484449864.0, + "step": 12693 + }, + { + "epoch": 1.6148072764279355, + "grad_norm": 1.4393643140792847, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8887826204299927, + "num_tokens": 484485261.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "grad_norm": 1.4829381704330444, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8793323040008545, + "num_tokens": 484525187.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "grad_norm": 1.4051544666290283, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8691242933273315, + "num_tokens": 484570427.0, + "step": 12696 + }, + { + "epoch": 
1.615188907263707, + "grad_norm": 1.458984136581421, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8929204940795898, + "num_tokens": 484607399.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "grad_norm": 1.3690030574798584, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8886235952377319, + "num_tokens": 484648355.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "grad_norm": 1.6260402202606201, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8544498682022095, + "num_tokens": 484689898.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "grad_norm": 1.5527371168136597, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8821543455123901, + "num_tokens": 484727565.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "grad_norm": 1.5036793947219849, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8784521818161011, + "num_tokens": 484767847.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "grad_norm": 1.5209853649139404, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8788425922393799, + "num_tokens": 484804243.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "grad_norm": 1.4504363536834717, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8772487640380859, + "num_tokens": 484845686.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "grad_norm": 1.4287452697753906, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8717334270477295, + "num_tokens": 484886989.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "grad_norm": 1.5202653408050537, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8651397228240967, + "num_tokens": 484925182.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "grad_norm": 1.4410392045974731, + "learning_rate": 1e-06, + "loss": 0.3386, + 
"mean_token_accuracy": 0.8802065849304199, + "num_tokens": 484964325.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "grad_norm": 1.509352684020996, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8839066028594971, + "num_tokens": 485000216.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "grad_norm": 1.4915335178375244, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.868640661239624, + "num_tokens": 485038203.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "grad_norm": 1.6283440589904785, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8835089802742004, + "num_tokens": 485070938.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "grad_norm": 1.5463324785232544, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8530689477920532, + "num_tokens": 485111468.0, + "step": 12710 + }, + { + "epoch": 1.616969851163974, + "grad_norm": 1.4982671737670898, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8632442951202393, + "num_tokens": 485147646.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "grad_norm": 1.4984784126281738, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8749853372573853, + "num_tokens": 485186904.0, + "step": 12712 + }, + { + "epoch": 1.6172242717211551, + "grad_norm": 1.5620609521865845, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8936985731124878, + "num_tokens": 485222111.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "grad_norm": 1.4958537817001343, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.903174877166748, + "num_tokens": 485256385.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "grad_norm": 1.4236571788787842, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8778415322303772, + "num_tokens": 485297181.0, + "step": 12715 + }, + { + "epoch": 
1.6176059025569267, + "grad_norm": 1.4692999124526978, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8675304651260376, + "num_tokens": 485335786.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "grad_norm": 1.511459469795227, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8659536838531494, + "num_tokens": 485374172.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "grad_norm": 1.4528861045837402, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8658249974250793, + "num_tokens": 485416148.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "grad_norm": 1.6119602918624878, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8643960952758789, + "num_tokens": 485451965.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "grad_norm": 1.3207608461380005, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8737842440605164, + "num_tokens": 485498503.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "grad_norm": 1.4653236865997314, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8772504925727844, + "num_tokens": 485538692.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "grad_norm": 1.6562596559524536, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.869803249835968, + "num_tokens": 485578135.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "grad_norm": 1.5277340412139893, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8960065245628357, + "num_tokens": 485609880.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "grad_norm": 1.7273285388946533, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8632069230079651, + "num_tokens": 485642218.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "grad_norm": 1.436373233795166, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 
0.8811560869216919, + "num_tokens": 485684610.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "grad_norm": 1.4621769189834595, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8739704489707947, + "num_tokens": 485726178.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "grad_norm": 1.3780934810638428, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8787747621536255, + "num_tokens": 485769214.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "grad_norm": 1.5353857278823853, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8797630071640015, + "num_tokens": 485804900.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "grad_norm": 1.5315309762954712, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8902965784072876, + "num_tokens": 485842289.0, + "step": 12729 + }, + { + "epoch": 1.6193868464571937, + "grad_norm": 1.5620675086975098, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8746863007545471, + "num_tokens": 485879496.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "grad_norm": 1.6147465705871582, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8817490339279175, + "num_tokens": 485913844.0, + "step": 12731 + }, + { + "epoch": 1.6196412670143747, + "grad_norm": 1.5170849561691284, + "learning_rate": 1e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9036875367164612, + "num_tokens": 485950361.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "grad_norm": 1.4620342254638672, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8720797896385193, + "num_tokens": 485987611.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "grad_norm": 1.5031877756118774, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8736516833305359, + "num_tokens": 486028010.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "grad_norm": 
1.5745090246200562, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8640342354774475, + "num_tokens": 486065140.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "grad_norm": 1.5113343000411987, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8797398805618286, + "num_tokens": 486103620.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "grad_norm": 1.5488296747207642, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8716939687728882, + "num_tokens": 486138477.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "grad_norm": 1.5940022468566895, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8727539777755737, + "num_tokens": 486176769.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "grad_norm": 1.7252261638641357, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8541826009750366, + "num_tokens": 486211389.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "grad_norm": 1.5781694650650024, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8890917897224426, + "num_tokens": 486247435.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "grad_norm": 1.4534353017807007, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8605508804321289, + "num_tokens": 486293329.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "grad_norm": 1.4853187799453735, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8907297849655151, + "num_tokens": 486328659.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "grad_norm": 1.5585590600967407, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8793103694915771, + "num_tokens": 486364594.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "grad_norm": 1.6256673336029053, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8782215118408203, + 
"num_tokens": 486399809.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "grad_norm": 1.5853599309921265, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8809635639190674, + "num_tokens": 486433261.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "grad_norm": 1.4610416889190674, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8777856826782227, + "num_tokens": 486472413.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "grad_norm": 1.4359935522079468, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8815135955810547, + "num_tokens": 486513446.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "grad_norm": 1.5454330444335938, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8678946495056152, + "num_tokens": 486551089.0, + "step": 12748 + }, + { + "epoch": 1.6218038417504135, + "grad_norm": 1.5256130695343018, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8657139539718628, + "num_tokens": 486589252.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "grad_norm": 1.4030954837799072, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8791514039039612, + "num_tokens": 486630516.0, + "step": 12750 + }, + { + "epoch": 1.6220582623075943, + "grad_norm": 1.3723986148834229, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8780996799468994, + "num_tokens": 486672184.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "grad_norm": 1.4550071954727173, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.871908962726593, + "num_tokens": 486714877.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "grad_norm": 1.5435538291931152, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8844438791275024, + "num_tokens": 486750455.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "grad_norm": 1.5336558818817139, + 
"learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8661997318267822, + "num_tokens": 486786645.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "grad_norm": 1.50382661819458, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8894779682159424, + "num_tokens": 486823024.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "grad_norm": 1.4944287538528442, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8824545741081238, + "num_tokens": 486860827.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "grad_norm": 1.5398485660552979, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8766645193099976, + "num_tokens": 486895747.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "grad_norm": 1.3675626516342163, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8769952654838562, + "num_tokens": 486940943.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "grad_norm": 1.5755071640014648, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8853676319122314, + "num_tokens": 486975436.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "grad_norm": 1.5237185955047607, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8825010657310486, + "num_tokens": 487009916.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "grad_norm": 1.52688467502594, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8784074783325195, + "num_tokens": 487045868.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "grad_norm": 1.5089198350906372, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8693112730979919, + "num_tokens": 487085898.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "grad_norm": 1.423295259475708, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8721210956573486, + "num_tokens": 487129228.0, + "step": 
12763 + }, + { + "epoch": 1.6237119959292712, + "grad_norm": 1.4777106046676636, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8792716264724731, + "num_tokens": 487167737.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "grad_norm": 1.5666128396987915, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8731435537338257, + "num_tokens": 487202572.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "grad_norm": 1.5277845859527588, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8790622353553772, + "num_tokens": 487238519.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "grad_norm": 1.4564911127090454, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8905494809150696, + "num_tokens": 487278636.0, + "step": 12767 + }, + { + "epoch": 1.6242208370436333, + "grad_norm": 1.3901944160461426, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8891555666923523, + "num_tokens": 487318650.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "grad_norm": 1.523356318473816, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8884023427963257, + "num_tokens": 487352006.0, + "step": 12769 + }, + { + "epoch": 1.6244752576008141, + "grad_norm": 1.57108736038208, + "learning_rate": 1e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.8969223499298096, + "num_tokens": 487382768.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "grad_norm": 1.4292893409729004, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8770543932914734, + "num_tokens": 487423736.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "grad_norm": 1.4503623247146606, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8678877353668213, + "num_tokens": 487464631.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "grad_norm": 1.4159908294677734, + "learning_rate": 1e-06, + "loss": 
0.3249, + "mean_token_accuracy": 0.8851680755615234, + "num_tokens": 487501901.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "grad_norm": 1.4050477743148804, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8844068050384521, + "num_tokens": 487538712.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "grad_norm": 1.352316975593567, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8891878128051758, + "num_tokens": 487579170.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "grad_norm": 1.471062421798706, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8763005137443542, + "num_tokens": 487618511.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "grad_norm": 1.4044822454452515, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8879508972167969, + "num_tokens": 487656625.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "grad_norm": 1.642846703529358, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.870280385017395, + "num_tokens": 487689857.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "grad_norm": 1.435394525527954, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8797705769538879, + "num_tokens": 487733626.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "grad_norm": 1.4804197549819946, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8683858513832092, + "num_tokens": 487771895.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "grad_norm": 1.639224886894226, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8694630265235901, + "num_tokens": 487808603.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "grad_norm": 1.3401799201965332, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8904861211776733, + "num_tokens": 487851347.0, + "step": 12782 + }, + { + "epoch": 
1.6261289912224908, + "grad_norm": 1.431896686553955, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8689072132110596, + "num_tokens": 487892783.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "grad_norm": 1.6348915100097656, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8660361170768738, + "num_tokens": 487929258.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "grad_norm": 1.5576835870742798, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8722387552261353, + "num_tokens": 487966398.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "grad_norm": 1.4783070087432861, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8740851283073425, + "num_tokens": 488007166.0, + "step": 12786 + }, + { + "epoch": 1.6266378323368529, + "grad_norm": 1.5511151552200317, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8773216009140015, + "num_tokens": 488043144.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "grad_norm": 1.4563356637954712, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8743734359741211, + "num_tokens": 488083482.0, + "step": 12788 + }, + { + "epoch": 1.626892252894034, + "grad_norm": 1.4937740564346313, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8773558139801025, + "num_tokens": 488123319.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "grad_norm": 1.5067212581634521, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8892408013343811, + "num_tokens": 488158366.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "grad_norm": 1.5418134927749634, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8517922163009644, + "num_tokens": 488199695.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "grad_norm": 1.4524421691894531, + "learning_rate": 1e-06, + "loss": 0.3554, + 
"mean_token_accuracy": 0.8751547932624817, + "num_tokens": 488245910.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "grad_norm": 1.6621488332748413, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8898179531097412, + "num_tokens": 488273813.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "grad_norm": 1.488448977470398, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8760411143302917, + "num_tokens": 488310978.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "grad_norm": 1.4409656524658203, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8892801403999329, + "num_tokens": 488346614.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "grad_norm": 1.464176893234253, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8729469776153564, + "num_tokens": 488387434.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "grad_norm": 1.6604610681533813, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8789710998535156, + "num_tokens": 488422139.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "grad_norm": 1.5647064447402954, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8699342608451843, + "num_tokens": 488457415.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "grad_norm": 1.8327549695968628, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8615382313728333, + "num_tokens": 488486503.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "grad_norm": 1.555145502090454, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8812639117240906, + "num_tokens": 488522136.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "grad_norm": 1.594347357749939, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.881310760974884, + "num_tokens": 488559827.0, + "step": 12801 + }, + { + "epoch": 
1.6285459865157104, + "grad_norm": 1.4921568632125854, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8702210783958435, + "num_tokens": 488598723.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "grad_norm": 1.4614403247833252, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8801937699317932, + "num_tokens": 488637160.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "grad_norm": 1.540091872215271, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8862407803535461, + "num_tokens": 488671137.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "grad_norm": 1.6443160772323608, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8734194040298462, + "num_tokens": 488701891.0, + "step": 12805 + }, + { + "epoch": 1.6290548276300725, + "grad_norm": 1.5575352907180786, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8789169788360596, + "num_tokens": 488738591.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "grad_norm": 1.444150447845459, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.887633204460144, + "num_tokens": 488776246.0, + "step": 12807 + }, + { + "epoch": 1.6293092481872535, + "grad_norm": 1.452080488204956, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8763558864593506, + "num_tokens": 488816448.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "grad_norm": 1.525073766708374, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8780868053436279, + "num_tokens": 488851755.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "grad_norm": 1.6101309061050415, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8785309791564941, + "num_tokens": 488886712.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "grad_norm": 1.5414820909500122, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 
0.8611624240875244, + "num_tokens": 488928024.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "grad_norm": 1.4570549726486206, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8812283277511597, + "num_tokens": 488966525.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "grad_norm": 1.5639448165893555, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8844952583312988, + "num_tokens": 489002470.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "grad_norm": 1.5696237087249756, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8818678855895996, + "num_tokens": 489040006.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "grad_norm": 1.4438908100128174, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8864759206771851, + "num_tokens": 489079339.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "grad_norm": 1.3683671951293945, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8821849822998047, + "num_tokens": 489121104.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "grad_norm": 1.4614750146865845, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8810783624649048, + "num_tokens": 489161654.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "grad_norm": 1.5286409854888916, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8769465684890747, + "num_tokens": 489200035.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "grad_norm": 1.447310447692871, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8896028995513916, + "num_tokens": 489235471.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "grad_norm": 1.4792656898498535, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8918279409408569, + "num_tokens": 489273790.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "grad_norm": 
1.675397515296936, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8683252930641174, + "num_tokens": 489310794.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "grad_norm": 1.3308920860290527, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8729410171508789, + "num_tokens": 489363590.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "grad_norm": 1.441904902458191, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8760350346565247, + "num_tokens": 489403860.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "grad_norm": 1.2982991933822632, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8826514482498169, + "num_tokens": 489449324.0, + "step": 12824 + }, + { + "epoch": 1.631471822923292, + "grad_norm": 1.5657527446746826, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8723661303520203, + "num_tokens": 489485670.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "grad_norm": 1.442481279373169, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8735620975494385, + "num_tokens": 489528971.0, + "step": 12826 + }, + { + "epoch": 1.6317262434804731, + "grad_norm": 1.4635260105133057, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8783676624298096, + "num_tokens": 489566247.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "grad_norm": 1.539015531539917, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.874033510684967, + "num_tokens": 489602494.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "grad_norm": 1.434378981590271, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8738453388214111, + "num_tokens": 489641883.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "grad_norm": 1.427162528038025, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8715449571609497, + "num_tokens": 
489682096.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "grad_norm": 1.4636704921722412, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8758057355880737, + "num_tokens": 489720609.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "grad_norm": 1.421591877937317, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8636021614074707, + "num_tokens": 489762820.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "grad_norm": 1.4366824626922607, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8808690905570984, + "num_tokens": 489803163.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "grad_norm": 1.578433871269226, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8875981569290161, + "num_tokens": 489839893.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "grad_norm": 1.5501277446746826, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8613038063049316, + "num_tokens": 489878572.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "grad_norm": 1.5831875801086426, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8659663796424866, + "num_tokens": 489912910.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "grad_norm": 1.3629659414291382, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8755331039428711, + "num_tokens": 489954212.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "grad_norm": 1.5471144914627075, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8813504576683044, + "num_tokens": 489987983.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "grad_norm": 1.39421546459198, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8791137933731079, + "num_tokens": 490028652.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "grad_norm": 1.4393832683563232, + "learning_rate": 
1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8767802119255066, + "num_tokens": 490072933.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "grad_norm": 1.6453979015350342, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.870295524597168, + "num_tokens": 490107421.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "grad_norm": 1.508947491645813, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8591839671134949, + "num_tokens": 490147999.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "grad_norm": 1.5313472747802734, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8802490234375, + "num_tokens": 490183428.0, + "step": 12843 + }, + { + "epoch": 1.6338888182165119, + "grad_norm": 1.4750643968582153, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8756425380706787, + "num_tokens": 490224447.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "grad_norm": 1.587867021560669, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.880877673625946, + "num_tokens": 490258182.0, + "step": 12845 + }, + { + "epoch": 1.634143238773693, + "grad_norm": 1.4174662828445435, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8744788765907288, + "num_tokens": 490300595.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "grad_norm": 1.4407626390457153, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8839586973190308, + "num_tokens": 490338797.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "grad_norm": 1.445028305053711, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8771784901618958, + "num_tokens": 490380536.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "grad_norm": 1.6298452615737915, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8744778633117676, + "num_tokens": 490419662.0, + "step": 12849 + }, + { + 
"epoch": 1.6346520798880548, + "grad_norm": 1.4973349571228027, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8786710500717163, + "num_tokens": 490455353.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "grad_norm": 1.4656970500946045, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8868311643600464, + "num_tokens": 490491156.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "grad_norm": 1.5424636602401733, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8786885738372803, + "num_tokens": 490528625.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "grad_norm": 1.5123234987258911, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8757578134536743, + "num_tokens": 490567682.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "grad_norm": 1.5707693099975586, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8740205764770508, + "num_tokens": 490606930.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "grad_norm": 1.5391868352890015, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8728263974189758, + "num_tokens": 490643094.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "grad_norm": 1.4470824003219604, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.871610164642334, + "num_tokens": 490683282.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "grad_norm": 1.5294784307479858, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8834513425827026, + "num_tokens": 490715983.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "grad_norm": 1.6049102544784546, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.865700364112854, + "num_tokens": 490753437.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "grad_norm": 1.6431643962860107, + "learning_rate": 1e-06, + "loss": 0.3972, + 
"mean_token_accuracy": 0.8641201257705688, + "num_tokens": 490789817.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "grad_norm": 1.5585153102874756, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.878699779510498, + "num_tokens": 490825832.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "grad_norm": 1.5233676433563232, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.878027081489563, + "num_tokens": 490860793.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "grad_norm": 1.6207139492034912, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8682405352592468, + "num_tokens": 490895455.0, + "step": 12862 + }, + { + "epoch": 1.6363058135097317, + "grad_norm": 1.536365270614624, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8772934675216675, + "num_tokens": 490933957.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "grad_norm": 1.5295077562332153, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8925292491912842, + "num_tokens": 490965206.0, + "step": 12864 + }, + { + "epoch": 1.6365602340669128, + "grad_norm": 1.639973759651184, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8825135231018066, + "num_tokens": 490996007.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "grad_norm": 1.5116374492645264, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8726649284362793, + "num_tokens": 491035902.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "grad_norm": 1.5287052392959595, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.886934757232666, + "num_tokens": 491074389.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "grad_norm": 1.5012205839157104, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8855140209197998, + "num_tokens": 491113066.0, + "step": 12868 + }, + { + "epoch": 
1.6370690751812746, + "grad_norm": 1.5909124612808228, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8782223463058472, + "num_tokens": 491149612.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "grad_norm": 1.523459792137146, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8538708686828613, + "num_tokens": 491193445.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "grad_norm": 1.446214199066162, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8900796175003052, + "num_tokens": 491232279.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "grad_norm": 1.6032236814498901, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8768365383148193, + "num_tokens": 491268590.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "grad_norm": 1.4662213325500488, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8775311708450317, + "num_tokens": 491309717.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "grad_norm": 1.509097933769226, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8755661249160767, + "num_tokens": 491350176.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "grad_norm": 1.5725380182266235, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.864773154258728, + "num_tokens": 491385632.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "grad_norm": 1.4631602764129639, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8879606127738953, + "num_tokens": 491423527.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "grad_norm": 1.628475308418274, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8875393867492676, + "num_tokens": 491452568.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "grad_norm": 1.5177350044250488, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 
0.8826794028282166, + "num_tokens": 491490294.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "grad_norm": 1.5489344596862793, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8719598054885864, + "num_tokens": 491529645.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "grad_norm": 1.5695706605911255, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8745486736297607, + "num_tokens": 491561915.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "grad_norm": 1.4290449619293213, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8718912601470947, + "num_tokens": 491602897.0, + "step": 12881 + }, + { + "epoch": 1.6387228088029513, + "grad_norm": 1.5261473655700684, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.878879964351654, + "num_tokens": 491642128.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "grad_norm": 1.4497524499893188, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8963409662246704, + "num_tokens": 491680093.0, + "step": 12883 + }, + { + "epoch": 1.6389772293601323, + "grad_norm": 1.6302506923675537, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8799430131912231, + "num_tokens": 491713503.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "grad_norm": 1.442445158958435, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.867035448551178, + "num_tokens": 491756047.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "grad_norm": 1.4128994941711426, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8773672580718994, + "num_tokens": 491800904.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "grad_norm": 1.5763721466064453, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8629132509231567, + "num_tokens": 491836520.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "grad_norm": 
1.6825131177902222, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8621774911880493, + "num_tokens": 491871440.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "grad_norm": 1.5420746803283691, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8852587342262268, + "num_tokens": 491908874.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "grad_norm": 1.4529905319213867, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8907078504562378, + "num_tokens": 491945396.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "grad_norm": 1.5435922145843506, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8874784708023071, + "num_tokens": 491976944.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "grad_norm": 1.3730318546295166, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.889319896697998, + "num_tokens": 492018037.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "grad_norm": 1.404687523841858, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8712716102600098, + "num_tokens": 492065550.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "grad_norm": 1.5570002794265747, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8790410161018372, + "num_tokens": 492103496.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "grad_norm": 1.561659574508667, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8780913352966309, + "num_tokens": 492137752.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "grad_norm": 1.3282839059829712, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8780601024627686, + "num_tokens": 492181607.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "grad_norm": 1.3860102891921997, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8877671957015991, + 
"num_tokens": 492222307.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "grad_norm": 1.584976315498352, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8898471593856812, + "num_tokens": 492254915.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "grad_norm": 1.3585087060928345, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8959448933601379, + "num_tokens": 492296622.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "grad_norm": 1.4708954095840454, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8778323531150818, + "num_tokens": 492338937.0, + "step": 12900 + }, + { + "epoch": 1.6411398040961709, + "grad_norm": 1.6541568040847778, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8604451417922974, + "num_tokens": 492371845.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "grad_norm": 1.576430082321167, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8826914429664612, + "num_tokens": 492405143.0, + "step": 12902 + }, + { + "epoch": 1.641394224653352, + "grad_norm": 1.5406625270843506, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8739466667175293, + "num_tokens": 492441491.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "grad_norm": 1.4024163484573364, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8801364898681641, + "num_tokens": 492481632.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "grad_norm": 1.5012176036834717, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8807620406150818, + "num_tokens": 492519264.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "grad_norm": 1.5363457202911377, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.880209743976593, + "num_tokens": 492554662.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "grad_norm": 1.3484604358673096, + 
"learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8846686482429504, + "num_tokens": 492601779.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "grad_norm": 1.5050089359283447, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8844188451766968, + "num_tokens": 492636559.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "grad_norm": 1.4321050643920898, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8880034685134888, + "num_tokens": 492674832.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "grad_norm": 1.5701267719268799, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8766355514526367, + "num_tokens": 492713839.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "grad_norm": 1.5155389308929443, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8823164701461792, + "num_tokens": 492747570.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "grad_norm": 1.527713418006897, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8761295676231384, + "num_tokens": 492786141.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "grad_norm": 1.4827675819396973, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8569599390029907, + "num_tokens": 492827570.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "grad_norm": 1.4394807815551758, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8695064187049866, + "num_tokens": 492871316.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "grad_norm": 1.5421992540359497, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8693106174468994, + "num_tokens": 492910781.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "grad_norm": 1.451995611190796, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8808261156082153, + "num_tokens": 492949462.0, + 
"step": 12916 + }, + { + "epoch": 1.6431751685536191, + "grad_norm": 1.4110568761825562, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8795639872550964, + "num_tokens": 492988753.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "grad_norm": 1.4090499877929688, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8847579956054688, + "num_tokens": 493030294.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "grad_norm": 1.508236289024353, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8788127899169922, + "num_tokens": 493066221.0, + "step": 12919 + }, + { + "epoch": 1.6435567993893907, + "grad_norm": 1.391943097114563, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8805679082870483, + "num_tokens": 493107760.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "grad_norm": 1.432474970817566, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8958547115325928, + "num_tokens": 493147569.0, + "step": 12921 + }, + { + "epoch": 1.6438112199465715, + "grad_norm": 1.4598705768585205, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8829584121704102, + "num_tokens": 493185940.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "grad_norm": 1.6013553142547607, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8821568489074707, + "num_tokens": 493224598.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "grad_norm": 1.5845390558242798, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8614946603775024, + "num_tokens": 493265599.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "grad_norm": 1.5651288032531738, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8564491271972656, + "num_tokens": 493300967.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "grad_norm": 1.3704601526260376, + "learning_rate": 1e-06, + "loss": 
0.3321, + "mean_token_accuracy": 0.8821749091148376, + "num_tokens": 493344963.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "grad_norm": 1.5173476934432983, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8892260193824768, + "num_tokens": 493383632.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "grad_norm": 1.5213795900344849, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8891116380691528, + "num_tokens": 493420570.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "grad_norm": 1.570311188697815, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8698011040687561, + "num_tokens": 493461580.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "grad_norm": 1.3393131494522095, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8788610696792603, + "num_tokens": 493504751.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "grad_norm": 1.7220488786697388, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8644592761993408, + "num_tokens": 493537274.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "grad_norm": 1.5110461711883545, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8847146034240723, + "num_tokens": 493573929.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "grad_norm": 1.660820484161377, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8675541281700134, + "num_tokens": 493606385.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "grad_norm": 1.5455378293991089, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8775728344917297, + "num_tokens": 493643240.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "grad_norm": 1.4467675685882568, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8873840570449829, + "num_tokens": 493680462.0, + "step": 12935 + }, + { + "epoch": 
1.645592163846839, + "grad_norm": 1.4539481401443481, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.893843412399292, + "num_tokens": 493714417.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "grad_norm": 1.5053178071975708, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8768063187599182, + "num_tokens": 493752940.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "grad_norm": 1.48667573928833, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8906227350234985, + "num_tokens": 493789861.0, + "step": 12938 + }, + { + "epoch": 1.6459737946826105, + "grad_norm": 1.4834084510803223, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.88774174451828, + "num_tokens": 493829492.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "grad_norm": 1.441430687904358, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.892223596572876, + "num_tokens": 493865810.0, + "step": 12940 + }, + { + "epoch": 1.6462282152397913, + "grad_norm": 1.5501892566680908, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.873331606388092, + "num_tokens": 493903523.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "grad_norm": 1.524796485900879, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8730460405349731, + "num_tokens": 493940219.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "grad_norm": 1.4858399629592896, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8762742877006531, + "num_tokens": 493982074.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "grad_norm": 1.5139250755310059, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8801878690719604, + "num_tokens": 494020280.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "grad_norm": 1.5484282970428467, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 
0.87607741355896, + "num_tokens": 494061528.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "grad_norm": 1.4383162260055542, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8835312128067017, + "num_tokens": 494101105.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "grad_norm": 1.5168315172195435, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8893759846687317, + "num_tokens": 494137759.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "grad_norm": 1.4076173305511475, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.88278728723526, + "num_tokens": 494177119.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "grad_norm": 1.8043198585510254, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8548431992530823, + "num_tokens": 494213443.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "grad_norm": 1.5009772777557373, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.870959460735321, + "num_tokens": 494253383.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "grad_norm": 1.5391932725906372, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8786605000495911, + "num_tokens": 494290620.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "grad_norm": 1.5109080076217651, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8803830742835999, + "num_tokens": 494330308.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "grad_norm": 1.634981632232666, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8800280690193176, + "num_tokens": 494362093.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "grad_norm": 1.4863635301589966, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8563361167907715, + "num_tokens": 494407318.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "grad_norm": 
1.5713711977005005, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8684362173080444, + "num_tokens": 494445798.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "grad_norm": 1.4320929050445557, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8928936719894409, + "num_tokens": 494482872.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "grad_norm": 1.3934705257415771, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8820399045944214, + "num_tokens": 494524245.0, + "step": 12957 + }, + { + "epoch": 1.64839078997583, + "grad_norm": 1.3512297868728638, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8766028881072998, + "num_tokens": 494571270.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "grad_norm": 1.598352313041687, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8675571084022522, + "num_tokens": 494606451.0, + "step": 12959 + }, + { + "epoch": 1.6486452105330112, + "grad_norm": 1.4243344068527222, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8867504000663757, + "num_tokens": 494646303.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "grad_norm": 1.5692514181137085, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8669199347496033, + "num_tokens": 494680696.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "grad_norm": 1.5860670804977417, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8630224466323853, + "num_tokens": 494717992.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "grad_norm": 1.4199557304382324, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8868675827980042, + "num_tokens": 494757499.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "grad_norm": 1.5646827220916748, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.88838130235672, + "num_tokens": 
494792016.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "grad_norm": 1.510428547859192, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8692574501037598, + "num_tokens": 494829446.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "grad_norm": 1.4473956823349, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8819478750228882, + "num_tokens": 494868073.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "grad_norm": 1.4917359352111816, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8746880292892456, + "num_tokens": 494909484.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "grad_norm": 1.5234615802764893, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8713634014129639, + "num_tokens": 494949368.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "grad_norm": 1.4695172309875488, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8809608817100525, + "num_tokens": 494986241.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "grad_norm": 1.2999444007873535, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8817082643508911, + "num_tokens": 495032291.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "grad_norm": 1.5596052408218384, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8866405487060547, + "num_tokens": 495067280.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "grad_norm": 1.5612001419067383, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8861371278762817, + "num_tokens": 495104261.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "grad_norm": 1.4378196001052856, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8941450119018555, + "num_tokens": 495142133.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "grad_norm": 1.4958833456039429, + "learning_rate": 
1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8942168951034546, + "num_tokens": 495178017.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "grad_norm": 1.5487254858016968, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8561494946479797, + "num_tokens": 495217994.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "grad_norm": 1.5340452194213867, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8695581555366516, + "num_tokens": 495256261.0, + "step": 12976 + }, + { + "epoch": 1.6508077852690497, + "grad_norm": 1.4332289695739746, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8758460283279419, + "num_tokens": 495299907.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "grad_norm": 1.6108975410461426, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8949192762374878, + "num_tokens": 495329167.0, + "step": 12978 + }, + { + "epoch": 1.6510622058262308, + "grad_norm": 1.555773138999939, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8738056421279907, + "num_tokens": 495366534.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "grad_norm": 1.5156227350234985, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8757020831108093, + "num_tokens": 495403714.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "grad_norm": 1.3940114974975586, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8951423168182373, + "num_tokens": 495444103.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "grad_norm": 1.5864901542663574, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8764599561691284, + "num_tokens": 495480407.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "grad_norm": 1.4664239883422852, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8789010047912598, + "num_tokens": 495519462.0, + "step": 12983 + }, 
+ { + "epoch": 1.6516982572191834, + "grad_norm": 1.4370040893554688, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8879022002220154, + "num_tokens": 495559893.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "grad_norm": 1.476463794708252, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8746651411056519, + "num_tokens": 495601948.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "grad_norm": 1.5223745107650757, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8754380941390991, + "num_tokens": 495640222.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "grad_norm": 1.4343522787094116, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8841728568077087, + "num_tokens": 495680013.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "grad_norm": 1.5113050937652588, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8734983205795288, + "num_tokens": 495717606.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "grad_norm": 1.470863699913025, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8856703042984009, + "num_tokens": 495753336.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "grad_norm": 1.4271292686462402, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8984555006027222, + "num_tokens": 495791013.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "grad_norm": 1.540254831314087, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8788193464279175, + "num_tokens": 495828018.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "grad_norm": 1.556970477104187, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8745784759521484, + "num_tokens": 495866044.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "grad_norm": 1.5586326122283936, + "learning_rate": 1e-06, + "loss": 0.4042, + 
"mean_token_accuracy": 0.8589284420013428, + "num_tokens": 495903217.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "grad_norm": 1.4167449474334717, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8772005438804626, + "num_tokens": 495942163.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "grad_norm": 1.4285714626312256, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8788153529167175, + "num_tokens": 495982168.0, + "step": 12995 + }, + { + "epoch": 1.6532247805622693, + "grad_norm": 1.6815447807312012, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.865272045135498, + "num_tokens": 496018343.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "grad_norm": 1.5860484838485718, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.866104006767273, + "num_tokens": 496056911.0, + "step": 12997 + }, + { + "epoch": 1.6534792011194503, + "grad_norm": 1.4394763708114624, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8730326890945435, + "num_tokens": 496098386.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "grad_norm": 1.612987756729126, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8596401214599609, + "num_tokens": 496136313.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "grad_norm": 1.6010091304779053, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8767373561859131, + "num_tokens": 496171512.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "grad_norm": 1.5161150693893433, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8764601349830627, + "num_tokens": 496208642.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "grad_norm": 1.4964525699615479, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8802729249000549, + "num_tokens": 496246328.0, + "step": 13002 + }, + { + "epoch": 
1.654115252512403, + "grad_norm": 1.4553152322769165, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8766472935676575, + "num_tokens": 496285367.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "grad_norm": 1.5532702207565308, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8723330497741699, + "num_tokens": 496328932.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "grad_norm": 1.4896180629730225, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8873450756072998, + "num_tokens": 496362879.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "grad_norm": 1.5428396463394165, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8747862577438354, + "num_tokens": 496397592.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "grad_norm": 1.3790559768676758, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8659971952438354, + "num_tokens": 496444474.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "grad_norm": 1.4563897848129272, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8675780296325684, + "num_tokens": 496485261.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "grad_norm": 1.5234943628311157, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8646236658096313, + "num_tokens": 496523607.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "grad_norm": 1.620240330696106, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8640873432159424, + "num_tokens": 496556207.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "grad_norm": 1.4978656768798828, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.880462646484375, + "num_tokens": 496594567.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "grad_norm": 1.4304531812667847, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 
0.8833703398704529, + "num_tokens": 496633771.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "grad_norm": 1.595536231994629, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8818614482879639, + "num_tokens": 496663493.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "grad_norm": 1.359267234802246, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8809846639633179, + "num_tokens": 496704637.0, + "step": 13014 + }, + { + "epoch": 1.655641775855489, + "grad_norm": 1.4794261455535889, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8722845315933228, + "num_tokens": 496743169.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "grad_norm": 1.4834789037704468, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8751475214958191, + "num_tokens": 496783664.0, + "step": 13016 + }, + { + "epoch": 1.6558961964126702, + "grad_norm": 1.4384074211120605, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8685893416404724, + "num_tokens": 496825977.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "grad_norm": 1.4837298393249512, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8771060705184937, + "num_tokens": 496866739.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "grad_norm": 1.5044931173324585, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8898212909698486, + "num_tokens": 496902524.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "grad_norm": 1.515439748764038, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8812685608863831, + "num_tokens": 496939989.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "grad_norm": 1.6768420934677124, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8745129108428955, + "num_tokens": 496973301.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "grad_norm": 
1.4179238080978394, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8842107057571411, + "num_tokens": 497013154.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "grad_norm": 1.5182660818099976, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8690469861030579, + "num_tokens": 497052417.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "grad_norm": 1.4980305433273315, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8654164671897888, + "num_tokens": 497096001.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "grad_norm": 1.6249773502349854, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8790684342384338, + "num_tokens": 497131354.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "grad_norm": 1.5445407629013062, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8608647584915161, + "num_tokens": 497167764.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "grad_norm": 1.4347163438796997, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8814390301704407, + "num_tokens": 497206143.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "grad_norm": 1.5772393941879272, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8748949766159058, + "num_tokens": 497239360.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "grad_norm": 1.423641562461853, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8800740242004395, + "num_tokens": 497278274.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "grad_norm": 1.7516146898269653, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8767510652542114, + "num_tokens": 497307882.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "grad_norm": 1.5312824249267578, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8961288332939148, + 
"num_tokens": 497340759.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "grad_norm": 1.4516921043395996, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8912491798400879, + "num_tokens": 497381321.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "grad_norm": 1.487217903137207, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.867194652557373, + "num_tokens": 497423034.0, + "step": 13033 + }, + { + "epoch": 1.658058771148709, + "grad_norm": 1.4883784055709839, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8745123147964478, + "num_tokens": 497461972.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "grad_norm": 1.5323421955108643, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8760578632354736, + "num_tokens": 497505277.0, + "step": 13035 + }, + { + "epoch": 1.65831319170589, + "grad_norm": 1.5046874284744263, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8855178356170654, + "num_tokens": 497542028.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "grad_norm": 1.7139533758163452, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8575764894485474, + "num_tokens": 497578194.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "grad_norm": 1.4154491424560547, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.878768265247345, + "num_tokens": 497617849.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "grad_norm": 1.4691466093063354, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8798909783363342, + "num_tokens": 497657936.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "grad_norm": 1.420199990272522, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8841795921325684, + "num_tokens": 497697409.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "grad_norm": 1.4976263046264648, + 
"learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8641281723976135, + "num_tokens": 497739374.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "grad_norm": 1.435137391090393, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8776668906211853, + "num_tokens": 497779521.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "grad_norm": 1.5352071523666382, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.862217366695404, + "num_tokens": 497821649.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "grad_norm": 1.591356873512268, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8708149790763855, + "num_tokens": 497859216.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "grad_norm": 1.544639229774475, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8752002716064453, + "num_tokens": 497893126.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "grad_norm": 1.4407328367233276, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.881056547164917, + "num_tokens": 497930391.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "grad_norm": 1.5958654880523682, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8599264621734619, + "num_tokens": 497967974.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "grad_norm": 6.078507423400879, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8651597499847412, + "num_tokens": 498006063.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "grad_norm": 1.5269737243652344, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8814953565597534, + "num_tokens": 498041742.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "grad_norm": 1.6571762561798096, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8629521131515503, + "num_tokens": 498077599.0, + "step": 
13050 + }, + { + "epoch": 1.6602213458847475, + "grad_norm": 1.4339385032653809, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8656193017959595, + "num_tokens": 498123556.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "grad_norm": 1.523583173751831, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8899859189987183, + "num_tokens": 498156921.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "grad_norm": 1.388157606124878, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.892385721206665, + "num_tokens": 498194601.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "grad_norm": 1.3418155908584595, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8868950605392456, + "num_tokens": 498240107.0, + "step": 13054 + }, + { + "epoch": 1.6607301869991096, + "grad_norm": 1.3348478078842163, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8917570114135742, + "num_tokens": 498279813.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "grad_norm": 1.5481188297271729, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8726961016654968, + "num_tokens": 498315999.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "grad_norm": 1.4233922958374023, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8839824795722961, + "num_tokens": 498357177.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "grad_norm": 1.5770397186279297, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8792911767959595, + "num_tokens": 498389221.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "grad_norm": 1.4703128337860107, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.894072413444519, + "num_tokens": 498424250.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "grad_norm": 1.6363627910614014, + "learning_rate": 1e-06, + "loss": 0.364, + 
"mean_token_accuracy": 0.8738709688186646, + "num_tokens": 498461588.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "grad_norm": 1.4476368427276611, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.8999422788619995, + "num_tokens": 498495579.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "grad_norm": 1.403221845626831, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8763443827629089, + "num_tokens": 498537092.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "grad_norm": 1.495492696762085, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8837364315986633, + "num_tokens": 498573992.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "grad_norm": 1.5726237297058105, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.878790557384491, + "num_tokens": 498610336.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "grad_norm": 1.5689562559127808, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8731802105903625, + "num_tokens": 498646775.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "grad_norm": 1.6495860815048218, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.868692934513092, + "num_tokens": 498687403.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "grad_norm": 1.3942745923995972, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8863224983215332, + "num_tokens": 498727772.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "grad_norm": 1.49341881275177, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8661569356918335, + "num_tokens": 498771483.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "grad_norm": 1.3562208414077759, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8815470337867737, + "num_tokens": 498814798.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + 
"grad_norm": 1.4486175775527954, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8849120140075684, + "num_tokens": 498851743.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "grad_norm": 1.4984064102172852, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8740346431732178, + "num_tokens": 498887251.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "grad_norm": 1.5108747482299805, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8750538229942322, + "num_tokens": 498924111.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "grad_norm": 1.4966901540756226, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8747555017471313, + "num_tokens": 498964510.0, + "step": 13073 + }, + { + "epoch": 1.6631471822923292, + "grad_norm": 1.4081002473831177, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8773879408836365, + "num_tokens": 499006093.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "grad_norm": 1.5777947902679443, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8655703067779541, + "num_tokens": 499043158.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "grad_norm": 1.5738321542739868, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8828521370887756, + "num_tokens": 499077800.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "grad_norm": 1.3443578481674194, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.886096715927124, + "num_tokens": 499121122.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "grad_norm": 1.4561288356781006, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8811357617378235, + "num_tokens": 499159644.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "grad_norm": 1.5320618152618408, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8872450590133667, 
+ "num_tokens": 499197282.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "grad_norm": 1.50131094455719, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8745232820510864, + "num_tokens": 499237395.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "grad_norm": 1.4754462242126465, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8736878633499146, + "num_tokens": 499278337.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "grad_norm": 1.5218546390533447, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8783172369003296, + "num_tokens": 499314122.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "grad_norm": 1.5021992921829224, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.866088330745697, + "num_tokens": 499358820.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "grad_norm": 1.3103293180465698, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8853071928024292, + "num_tokens": 499404143.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "grad_norm": 1.6115202903747559, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8757523894309998, + "num_tokens": 499436665.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "grad_norm": 1.3811129331588745, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8750810027122498, + "num_tokens": 499479212.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "grad_norm": 1.4377630949020386, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8733869791030884, + "num_tokens": 499519608.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "grad_norm": 1.619740605354309, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.876517653465271, + "num_tokens": 499553464.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "grad_norm": 1.644291877746582, + 
"learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.869351863861084, + "num_tokens": 499588884.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "grad_norm": 1.5792109966278076, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8657374382019043, + "num_tokens": 499627945.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "grad_norm": 1.6177706718444824, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8795303106307983, + "num_tokens": 499663226.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "grad_norm": 1.444800615310669, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8724188804626465, + "num_tokens": 499705858.0, + "step": 13092 + }, + { + "epoch": 1.6655641775855488, + "grad_norm": 1.4527066946029663, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8646817207336426, + "num_tokens": 499748493.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "grad_norm": 1.44390070438385, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8875832557678223, + "num_tokens": 499791262.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "grad_norm": 1.5429545640945435, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.860040009021759, + "num_tokens": 499829393.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "grad_norm": 1.5393409729003906, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8700821399688721, + "num_tokens": 499869415.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "grad_norm": 1.534027338027954, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8732209801673889, + "num_tokens": 499908865.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "grad_norm": 1.4907925128936768, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8832249641418457, + "num_tokens": 499945963.0, + "step": 
13098 + }, + { + "epoch": 1.666327439257092, + "grad_norm": 1.6411285400390625, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8581262230873108, + "num_tokens": 499981476.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "grad_norm": 1.4361213445663452, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8731478452682495, + "num_tokens": 500022726.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "grad_norm": 1.5087717771530151, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8742013573646545, + "num_tokens": 500064030.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "grad_norm": 1.3064894676208496, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8915284276008606, + "num_tokens": 500103751.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "grad_norm": 1.5530791282653809, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8858633041381836, + "num_tokens": 500137649.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "grad_norm": 1.3902901411056519, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8834488987922668, + "num_tokens": 500180071.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "grad_norm": 1.6460899114608765, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8632636666297913, + "num_tokens": 500214381.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "grad_norm": 1.5301789045333862, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8808249235153198, + "num_tokens": 500248815.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "grad_norm": 1.4149811267852783, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8907917737960815, + "num_tokens": 500290022.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "grad_norm": 1.6372337341308594, + "learning_rate": 1e-06, + "loss": 
0.3239, + "mean_token_accuracy": 0.886499285697937, + "num_tokens": 500319299.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "grad_norm": 1.5529338121414185, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8761627674102783, + "num_tokens": 500355114.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "grad_norm": 1.6045269966125488, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8867292404174805, + "num_tokens": 500392774.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "grad_norm": 1.452709436416626, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8867411017417908, + "num_tokens": 500432320.0, + "step": 13111 + }, + { + "epoch": 1.6679811728787686, + "grad_norm": 1.5025030374526978, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8758000135421753, + "num_tokens": 500470509.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "grad_norm": 1.493165373802185, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.884108304977417, + "num_tokens": 500507500.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "grad_norm": 1.333918809890747, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8755092620849609, + "num_tokens": 500553867.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "grad_norm": 1.552994966506958, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8601194620132446, + "num_tokens": 500590057.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "grad_norm": 1.399944543838501, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8887865543365479, + "num_tokens": 500627941.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "grad_norm": 1.5237802267074585, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8755582571029663, + "num_tokens": 500666071.0, + "step": 13117 + }, + { + "epoch": 
1.6687444345503115, + "grad_norm": 1.4724643230438232, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8844900131225586, + "num_tokens": 500701338.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "grad_norm": 1.4022961854934692, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8713204264640808, + "num_tokens": 500744897.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "grad_norm": 1.4947686195373535, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8810575008392334, + "num_tokens": 500784048.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "grad_norm": 1.6851531267166138, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.874153733253479, + "num_tokens": 500817214.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "grad_norm": 1.4473010301589966, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8706346750259399, + "num_tokens": 500856618.0, + "step": 13122 + }, + { + "epoch": 1.6693804859432642, + "grad_norm": 1.492390751838684, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8761118650436401, + "num_tokens": 500891270.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "grad_norm": 1.481575608253479, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8707932829856873, + "num_tokens": 500931246.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "grad_norm": 1.3945337533950806, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.888046145439148, + "num_tokens": 500970281.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "grad_norm": 1.6927332878112793, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8559719324111938, + "num_tokens": 501014948.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "grad_norm": 1.439804196357727, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 
0.8749526143074036, + "num_tokens": 501056361.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "grad_norm": 1.4991655349731445, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8809628486633301, + "num_tokens": 501093908.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "grad_norm": 1.5683655738830566, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.880095362663269, + "num_tokens": 501130692.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "grad_norm": 1.5666961669921875, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8642964363098145, + "num_tokens": 501165635.0, + "step": 13130 + }, + { + "epoch": 1.6703981681719884, + "grad_norm": 1.6440722942352295, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8537369966506958, + "num_tokens": 501200903.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "grad_norm": 1.5331335067749023, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8822901844978333, + "num_tokens": 501235288.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "grad_norm": 1.4263463020324707, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8818286061286926, + "num_tokens": 501274211.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "grad_norm": 1.478934645652771, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8791214823722839, + "num_tokens": 501312270.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "grad_norm": 1.510638952255249, + "learning_rate": 1e-06, + "loss": 0.2719, + "mean_token_accuracy": 0.9020035266876221, + "num_tokens": 501344330.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "grad_norm": 1.520129919052124, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8750086426734924, + "num_tokens": 501380296.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "grad_norm": 
1.430905818939209, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8872597813606262, + "num_tokens": 501418841.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "grad_norm": 1.4231960773468018, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.881806492805481, + "num_tokens": 501461998.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "grad_norm": 1.4795182943344116, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8968111872673035, + "num_tokens": 501499012.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "grad_norm": 1.6590014696121216, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8683832287788391, + "num_tokens": 501530734.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "grad_norm": 1.5746480226516724, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8689139485359192, + "num_tokens": 501569042.0, + "step": 13141 + }, + { + "epoch": 1.6717974812364838, + "grad_norm": 1.4258430004119873, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8872448801994324, + "num_tokens": 501608056.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "grad_norm": 1.5387893915176392, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8782535791397095, + "num_tokens": 501641896.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "grad_norm": 1.566178798675537, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8775166273117065, + "num_tokens": 501683138.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "grad_norm": 1.5038126707077026, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8746592402458191, + "num_tokens": 501722523.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "grad_norm": 1.4832887649536133, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8689697980880737, + "num_tokens": 
501765624.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "grad_norm": 1.4155758619308472, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.876228928565979, + "num_tokens": 501806675.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "grad_norm": 1.527611255645752, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8645509481430054, + "num_tokens": 501844992.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "grad_norm": 1.5732033252716064, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8726199865341187, + "num_tokens": 501883276.0, + "step": 13149 + }, + { + "epoch": 1.672815163465208, + "grad_norm": 1.515731692314148, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8716863393783569, + "num_tokens": 501924760.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "grad_norm": 1.6027231216430664, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8560628890991211, + "num_tokens": 501961547.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "grad_norm": 1.499751091003418, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8714514374732971, + "num_tokens": 502001533.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "grad_norm": 1.5206451416015625, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8884215354919434, + "num_tokens": 502036249.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "grad_norm": 1.474219799041748, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8826402425765991, + "num_tokens": 502071739.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "grad_norm": 1.5766972303390503, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8832060098648071, + "num_tokens": 502105005.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "grad_norm": 1.5977497100830078, + "learning_rate": 1e-06, 
+ "loss": 0.3453, + "mean_token_accuracy": 0.8787640333175659, + "num_tokens": 502138666.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "grad_norm": 1.4718742370605469, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8757530450820923, + "num_tokens": 502179455.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "grad_norm": 1.3905569314956665, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8771858215332031, + "num_tokens": 502220867.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "grad_norm": 1.386608362197876, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8842885494232178, + "num_tokens": 502261891.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "grad_norm": 1.5018541812896729, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8782404065132141, + "num_tokens": 502299858.0, + "step": 13160 + }, + { + "epoch": 1.6742144765297036, + "grad_norm": 1.4415332078933716, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8899802565574646, + "num_tokens": 502337768.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "grad_norm": 1.484824538230896, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8614771366119385, + "num_tokens": 502378197.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "grad_norm": 1.4180704355239868, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8839951753616333, + "num_tokens": 502416647.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "grad_norm": 1.4347878694534302, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8704642057418823, + "num_tokens": 502461068.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "grad_norm": 1.3297370672225952, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8856099247932434, + "num_tokens": 502505873.0, + "step": 13165 + }, + { + 
"epoch": 1.674850527922656, + "grad_norm": 1.4071872234344482, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8821170926094055, + "num_tokens": 502546343.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "grad_norm": 1.3497183322906494, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8859748244285583, + "num_tokens": 502585490.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "grad_norm": 1.5383082628250122, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8713743686676025, + "num_tokens": 502621028.0, + "step": 13168 + }, + { + "epoch": 1.6752321587584276, + "grad_norm": 1.5009119510650635, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.885094404220581, + "num_tokens": 502664301.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "grad_norm": 1.4172232151031494, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8793103098869324, + "num_tokens": 502705258.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "grad_norm": 1.4533531665802002, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8840435743331909, + "num_tokens": 502741050.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "grad_norm": 1.3967704772949219, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8767723441123962, + "num_tokens": 502782072.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "grad_norm": 1.4572535753250122, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8849072456359863, + "num_tokens": 502821737.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "grad_norm": 1.4704020023345947, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.896761417388916, + "num_tokens": 502859570.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "grad_norm": 1.5595550537109375, + "learning_rate": 1e-06, + "loss": 0.3192, + 
"mean_token_accuracy": 0.8886226415634155, + "num_tokens": 502892746.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "grad_norm": 1.4836030006408691, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8785058856010437, + "num_tokens": 502929450.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "grad_norm": 1.5221588611602783, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8747134804725647, + "num_tokens": 502967847.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "grad_norm": 1.4287320375442505, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8857535123825073, + "num_tokens": 503006683.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "grad_norm": 1.2649177312850952, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8920212984085083, + "num_tokens": 503050712.0, + "step": 13179 + }, + { + "epoch": 1.6766314718229234, + "grad_norm": 1.361386775970459, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8862338662147522, + "num_tokens": 503092754.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "grad_norm": 1.5764847993850708, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8889403343200684, + "num_tokens": 503130230.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "grad_norm": 1.5219887495040894, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.875584602355957, + "num_tokens": 503169280.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "grad_norm": 1.4958285093307495, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8797094821929932, + "num_tokens": 503207397.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "grad_norm": 1.3911043405532837, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8859339952468872, + "num_tokens": 503247405.0, + "step": 13184 + }, + { + "epoch": 
1.6772675232158758, + "grad_norm": 1.4395779371261597, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8861402869224548, + "num_tokens": 503283584.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "grad_norm": 1.6549714803695679, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8718340396881104, + "num_tokens": 503322752.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "grad_norm": 1.7439093589782715, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8656899929046631, + "num_tokens": 503353439.0, + "step": 13187 + }, + { + "epoch": 1.6776491540516474, + "grad_norm": 1.5567054748535156, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.877516508102417, + "num_tokens": 503387294.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "grad_norm": 1.4045900106430054, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8846064805984497, + "num_tokens": 503427297.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "grad_norm": 1.440346360206604, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8858902454376221, + "num_tokens": 503461231.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "grad_norm": 1.4550474882125854, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8633089661598206, + "num_tokens": 503501897.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "grad_norm": 1.5003604888916016, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8792175650596619, + "num_tokens": 503542091.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "grad_norm": 1.5557966232299805, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8872920274734497, + "num_tokens": 503574644.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "grad_norm": 1.5358340740203857, + "learning_rate": 1e-06, + "loss": 0.3214, + 
"mean_token_accuracy": 0.8877238035202026, + "num_tokens": 503611685.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "grad_norm": 1.473140835762024, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8939570188522339, + "num_tokens": 503649228.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "grad_norm": 1.4476516246795654, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8748457431793213, + "num_tokens": 503694481.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "grad_norm": 1.5188159942626953, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8823989629745483, + "num_tokens": 503732882.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "grad_norm": 1.6088982820510864, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8659886717796326, + "num_tokens": 503768499.0, + "step": 13198 + }, + { + "epoch": 1.679048467116143, + "grad_norm": 1.421032190322876, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8876312971115112, + "num_tokens": 503806219.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "grad_norm": 1.4778268337249756, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8861349821090698, + "num_tokens": 503846306.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "grad_norm": 1.601109266281128, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8682897686958313, + "num_tokens": 503881315.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "grad_norm": 1.5543479919433594, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8716785311698914, + "num_tokens": 503918043.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "grad_norm": 1.44290030002594, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8720746636390686, + "num_tokens": 503959896.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + 
"grad_norm": 1.6740076541900635, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8741244077682495, + "num_tokens": 503997275.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "grad_norm": 1.5326672792434692, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8949273824691772, + "num_tokens": 504030574.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "grad_norm": 1.5875970125198364, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8790910840034485, + "num_tokens": 504065663.0, + "step": 13206 + }, + { + "epoch": 1.6800661493448672, + "grad_norm": 1.4480524063110352, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8739144802093506, + "num_tokens": 504105808.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "grad_norm": 1.3865337371826172, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8777982592582703, + "num_tokens": 504147772.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "grad_norm": 1.508030891418457, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8704400062561035, + "num_tokens": 504186436.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "grad_norm": 1.4356688261032104, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8831384181976318, + "num_tokens": 504227967.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "grad_norm": 1.4371957778930664, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8811030983924866, + "num_tokens": 504268591.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "grad_norm": 1.6579902172088623, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8737239837646484, + "num_tokens": 504304774.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "grad_norm": 1.4670791625976562, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8717576265335083, 
+ "num_tokens": 504345288.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "grad_norm": 1.687788724899292, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8566917777061462, + "num_tokens": 504377741.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "grad_norm": 1.557296633720398, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.880331814289093, + "num_tokens": 504416620.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "grad_norm": 1.6242156028747559, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8731359243392944, + "num_tokens": 504452821.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "grad_norm": 1.5678044557571411, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8830212354660034, + "num_tokens": 504488006.0, + "step": 13217 + }, + { + "epoch": 1.6814654624093626, + "grad_norm": 1.427086591720581, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8793826103210449, + "num_tokens": 504534001.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "grad_norm": 1.5693191289901733, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.882622480392456, + "num_tokens": 504567691.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "grad_norm": 1.5550758838653564, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8639585971832275, + "num_tokens": 504608672.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "grad_norm": 1.527422547340393, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8579671382904053, + "num_tokens": 504645923.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "grad_norm": 1.5195928812026978, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8797552585601807, + "num_tokens": 504686651.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "grad_norm": 1.468946933746338, + 
"learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8765503168106079, + "num_tokens": 504723977.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "grad_norm": 1.4787423610687256, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8815721869468689, + "num_tokens": 504759230.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "grad_norm": 1.419629693031311, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8887313008308411, + "num_tokens": 504798254.0, + "step": 13225 + }, + { + "epoch": 1.6824831446380868, + "grad_norm": 1.5204405784606934, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8778994083404541, + "num_tokens": 504836744.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "grad_norm": 1.488062858581543, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8858529925346375, + "num_tokens": 504872916.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "grad_norm": 1.4918458461761475, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8755894899368286, + "num_tokens": 504916913.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "grad_norm": 1.6728808879852295, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8757122755050659, + "num_tokens": 504949389.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "grad_norm": 1.5429874658584595, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8665771484375, + "num_tokens": 504990297.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "grad_norm": 1.7233506441116333, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8706182241439819, + "num_tokens": 505019824.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "grad_norm": 1.5935111045837402, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8698172569274902, + "num_tokens": 505055910.0, + "step": 
13232 + }, + { + "epoch": 1.6833736165882205, + "grad_norm": 1.5715409517288208, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8765817880630493, + "num_tokens": 505092474.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "grad_norm": 1.7722480297088623, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8698341250419617, + "num_tokens": 505127926.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "grad_norm": 1.438788890838623, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.9003728628158569, + "num_tokens": 505164242.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "grad_norm": 1.531383991241455, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8792167901992798, + "num_tokens": 505201069.0, + "step": 13236 + }, + { + "epoch": 1.6838824577025824, + "grad_norm": 1.7497518062591553, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8647066354751587, + "num_tokens": 505231551.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "grad_norm": 1.422838807106018, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8772086501121521, + "num_tokens": 505271448.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "grad_norm": 1.603934407234192, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8838678598403931, + "num_tokens": 505304371.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "grad_norm": 1.4877310991287231, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8720863461494446, + "num_tokens": 505343368.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "grad_norm": 1.516054630279541, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8771934509277344, + "num_tokens": 505382913.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "grad_norm": 1.46440589427948, + "learning_rate": 1e-06, + "loss": 0.3485, + 
"mean_token_accuracy": 0.873161792755127, + "num_tokens": 505422632.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "grad_norm": 1.498600959777832, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8746633529663086, + "num_tokens": 505458728.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "grad_norm": 1.3719892501831055, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.890643835067749, + "num_tokens": 505497388.0, + "step": 13244 + }, + { + "epoch": 1.6849001399313064, + "grad_norm": 1.5016065835952759, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8822855353355408, + "num_tokens": 505533479.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "grad_norm": 1.4790258407592773, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8781195282936096, + "num_tokens": 505575060.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "grad_norm": 1.6052782535552979, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8832716941833496, + "num_tokens": 505608233.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "grad_norm": 1.561249852180481, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8671568036079407, + "num_tokens": 505644256.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "grad_norm": 1.6370984315872192, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8763388395309448, + "num_tokens": 505678245.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "grad_norm": 1.4634603261947632, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8746416568756104, + "num_tokens": 505719725.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "grad_norm": 1.5843199491500854, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.871314287185669, + "num_tokens": 505756792.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + 
"grad_norm": 1.4509291648864746, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8803421854972839, + "num_tokens": 505792268.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "grad_norm": 1.54390287399292, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8882393836975098, + "num_tokens": 505827450.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "grad_norm": 1.472398042678833, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.87264084815979, + "num_tokens": 505865512.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "grad_norm": 1.519556999206543, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8839653730392456, + "num_tokens": 505902919.0, + "step": 13255 + }, + { + "epoch": 1.6862994529958022, + "grad_norm": 1.5133016109466553, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8801497220993042, + "num_tokens": 505938624.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "grad_norm": 1.5251402854919434, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8756100535392761, + "num_tokens": 505975049.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "grad_norm": 1.5384405851364136, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8725947141647339, + "num_tokens": 506015202.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "grad_norm": 1.7346725463867188, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8424007892608643, + "num_tokens": 506055050.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "grad_norm": 1.569106936454773, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8863787055015564, + "num_tokens": 506087936.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "grad_norm": 1.4926220178604126, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8724104166030884, + 
"num_tokens": 506128656.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "grad_norm": 1.506269097328186, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8874538540840149, + "num_tokens": 506162608.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "grad_norm": 1.4151322841644287, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8744284510612488, + "num_tokens": 506203428.0, + "step": 13263 + }, + { + "epoch": 1.687317135224526, + "grad_norm": 1.4888932704925537, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8679724335670471, + "num_tokens": 506242688.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "grad_norm": 1.4133397340774536, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.876194179058075, + "num_tokens": 506283571.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "grad_norm": 1.5113810300827026, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8788076639175415, + "num_tokens": 506316540.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "grad_norm": 1.4245527982711792, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8751088380813599, + "num_tokens": 506356564.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "grad_norm": 1.575844407081604, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8773133158683777, + "num_tokens": 506392738.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "grad_norm": 1.6348446607589722, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8628858327865601, + "num_tokens": 506427383.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "grad_norm": 1.4627013206481934, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8826862573623657, + "num_tokens": 506466842.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "grad_norm": 1.6462160348892212, + 
"learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8784477114677429, + "num_tokens": 506498758.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "grad_norm": 1.5830398797988892, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8691847324371338, + "num_tokens": 506535871.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "grad_norm": 1.4374942779541016, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.875625729560852, + "num_tokens": 506576894.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "grad_norm": 1.4712200164794922, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8731231093406677, + "num_tokens": 506615745.0, + "step": 13274 + }, + { + "epoch": 1.6887164482890218, + "grad_norm": 1.4831538200378418, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8782538175582886, + "num_tokens": 506653299.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "grad_norm": 1.4212331771850586, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8966196775436401, + "num_tokens": 506686861.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "grad_norm": 1.438083291053772, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8898860812187195, + "num_tokens": 506723499.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "grad_norm": 1.5395920276641846, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8854148387908936, + "num_tokens": 506761423.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "grad_norm": 1.4940075874328613, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.879806399345398, + "num_tokens": 506798162.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "grad_norm": 1.37798011302948, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8808416128158569, + "num_tokens": 506839040.0, + "step": 
13280 + }, + { + "epoch": 1.689479709960565, + "grad_norm": 1.5680203437805176, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8778876662254333, + "num_tokens": 506873511.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "grad_norm": 1.4198005199432373, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8852085471153259, + "num_tokens": 506913700.0, + "step": 13282 + }, + { + "epoch": 1.6897341305177458, + "grad_norm": 1.4750577211380005, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8724637031555176, + "num_tokens": 506956769.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "grad_norm": 1.4287995100021362, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8907511830329895, + "num_tokens": 506989611.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "grad_norm": 1.5124961137771606, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8854426145553589, + "num_tokens": 507029297.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "grad_norm": 1.4929832220077515, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8801669478416443, + "num_tokens": 507067565.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "grad_norm": 1.496719241142273, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8740688562393188, + "num_tokens": 507102233.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "grad_norm": 1.456970453262329, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8820639252662659, + "num_tokens": 507143326.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "grad_norm": 1.6530423164367676, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8618550896644592, + "num_tokens": 507177723.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "grad_norm": 1.4450687170028687, + "learning_rate": 1e-06, + "loss": 
0.3594, + "mean_token_accuracy": 0.8747169971466064, + "num_tokens": 507218408.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "grad_norm": 1.63813054561615, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8575592637062073, + "num_tokens": 507256214.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "grad_norm": 1.7735652923583984, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8754803538322449, + "num_tokens": 507285837.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "grad_norm": 1.5811549425125122, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8788164258003235, + "num_tokens": 507320136.0, + "step": 13293 + }, + { + "epoch": 1.6911334435822414, + "grad_norm": 1.6301417350769043, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8629646301269531, + "num_tokens": 507354581.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "grad_norm": 1.5345743894577026, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8863885998725891, + "num_tokens": 507389552.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "grad_norm": 1.3689392805099487, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8863152861595154, + "num_tokens": 507428878.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "grad_norm": 1.4214065074920654, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8689403533935547, + "num_tokens": 507471730.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "grad_norm": 1.4510687589645386, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8767156600952148, + "num_tokens": 507516465.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "grad_norm": 1.505496859550476, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.881277859210968, + "num_tokens": 507556475.0, + "step": 13299 + }, + { + "epoch": 
1.6918967052537845, + "grad_norm": 1.551800012588501, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8844242095947266, + "num_tokens": 507592264.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "grad_norm": 1.3107513189315796, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8845240473747253, + "num_tokens": 507636470.0, + "step": 13301 + }, + { + "epoch": 1.6921511258109656, + "grad_norm": 1.4491976499557495, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8821077346801758, + "num_tokens": 507675415.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "grad_norm": 1.4931390285491943, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8767076730728149, + "num_tokens": 507717772.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "grad_norm": 1.610608458518982, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8827526569366455, + "num_tokens": 507751058.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "grad_norm": 1.5272583961486816, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8700352907180786, + "num_tokens": 507790581.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "grad_norm": 1.6336485147476196, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8812363147735596, + "num_tokens": 507820897.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "grad_norm": 1.7020187377929688, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.869615375995636, + "num_tokens": 507851331.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "grad_norm": 1.4766221046447754, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8712259531021118, + "num_tokens": 507887855.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "grad_norm": 1.4743034839630127, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 
0.8813826441764832, + "num_tokens": 507927089.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "grad_norm": 1.5908106565475464, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8652805089950562, + "num_tokens": 507966031.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "grad_norm": 1.4389103651046753, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.877935528755188, + "num_tokens": 508008980.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "grad_norm": 1.3810875415802002, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8790819644927979, + "num_tokens": 508052314.0, + "step": 13312 + }, + { + "epoch": 1.693550438875461, + "grad_norm": 1.4514787197113037, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8976379036903381, + "num_tokens": 508088043.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "grad_norm": 1.3840076923370361, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8887662887573242, + "num_tokens": 508129380.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "grad_norm": 1.4936872720718384, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8884522914886475, + "num_tokens": 508163890.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "grad_norm": 1.3939586877822876, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.870733380317688, + "num_tokens": 508205624.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "grad_norm": 1.5093423128128052, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8744115829467773, + "num_tokens": 508242360.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "grad_norm": 1.480689525604248, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8825980424880981, + "num_tokens": 508280258.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "grad_norm": 
1.6775896549224854, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8685455322265625, + "num_tokens": 508313765.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "grad_norm": 1.467535376548767, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8803678154945374, + "num_tokens": 508352965.0, + "step": 13320 + }, + { + "epoch": 1.6945681211041852, + "grad_norm": 1.5152982473373413, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8726695775985718, + "num_tokens": 508394353.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "grad_norm": 1.5287820100784302, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8839797377586365, + "num_tokens": 508429855.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "grad_norm": 1.3986828327178955, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8760993480682373, + "num_tokens": 508472886.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "grad_norm": 1.6176902055740356, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8509382009506226, + "num_tokens": 508511537.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "grad_norm": 1.4013923406600952, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8876197934150696, + "num_tokens": 508549069.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "grad_norm": 1.5054830312728882, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.88390052318573, + "num_tokens": 508584506.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "grad_norm": 1.6442416906356812, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8928771018981934, + "num_tokens": 508618305.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "grad_norm": 1.5923707485198975, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8887182474136353, + 
"num_tokens": 508650881.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "grad_norm": 1.6649994850158691, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8935059309005737, + "num_tokens": 508685769.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "grad_norm": 1.4655383825302124, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8889582753181458, + "num_tokens": 508718953.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "grad_norm": 1.6364295482635498, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8556725978851318, + "num_tokens": 508756941.0, + "step": 13331 + }, + { + "epoch": 1.6959674341686808, + "grad_norm": 1.4983575344085693, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8744017481803894, + "num_tokens": 508797569.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "grad_norm": 1.4313591718673706, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8839138746261597, + "num_tokens": 508837181.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "grad_norm": 1.6039459705352783, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8670387268066406, + "num_tokens": 508873519.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "grad_norm": 1.4578834772109985, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8616759777069092, + "num_tokens": 508915738.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "grad_norm": 1.615378737449646, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8693580627441406, + "num_tokens": 508950800.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "grad_norm": 1.474483609199524, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8772892951965332, + "num_tokens": 508988975.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "grad_norm": 1.4545884132385254, + 
"learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8843091726303101, + "num_tokens": 509026758.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "grad_norm": 1.3839750289916992, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8864349126815796, + "num_tokens": 509066088.0, + "step": 13339 + }, + { + "epoch": 1.6969851163974048, + "grad_norm": 1.4861794710159302, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8794820308685303, + "num_tokens": 509108111.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "grad_norm": 1.4092915058135986, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8785333633422852, + "num_tokens": 509150115.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "grad_norm": 1.5480786561965942, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8826426267623901, + "num_tokens": 509186787.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "grad_norm": 1.3838574886322021, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8769612312316895, + "num_tokens": 509231062.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "grad_norm": 1.5544676780700684, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8904857635498047, + "num_tokens": 509265071.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "grad_norm": 1.5570036172866821, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8736389875411987, + "num_tokens": 509302714.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "grad_norm": 1.4984334707260132, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8809746503829956, + "num_tokens": 509341094.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "grad_norm": 1.5195177793502808, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.877498209476471, + "num_tokens": 509378845.0, + 
"step": 13347 + }, + { + "epoch": 1.698002798626129, + "grad_norm": 1.668485164642334, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8752747178077698, + "num_tokens": 509410904.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "grad_norm": 1.4814389944076538, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8870886564254761, + "num_tokens": 509450488.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "grad_norm": 1.5516479015350342, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8617212176322937, + "num_tokens": 509490326.0, + "step": 13350 + }, + { + "epoch": 1.6983844294619006, + "grad_norm": 1.3182427883148193, + "learning_rate": 1e-06, + "loss": 0.2736, + "mean_token_accuracy": 0.8994412422180176, + "num_tokens": 509529772.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "grad_norm": 1.4993743896484375, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8879948854446411, + "num_tokens": 509563294.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "grad_norm": 1.515758752822876, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8810265064239502, + "num_tokens": 509598697.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "grad_norm": 1.4458953142166138, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8806271553039551, + "num_tokens": 509634800.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "grad_norm": 1.4477365016937256, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8757985830307007, + "num_tokens": 509676189.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "grad_norm": 1.585699439048767, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8662607669830322, + "num_tokens": 509710400.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "grad_norm": 1.4370744228363037, + "learning_rate": 1e-06, + "loss": 
0.3553, + "mean_token_accuracy": 0.8788509368896484, + "num_tokens": 509750328.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "grad_norm": 1.583886981010437, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.866341233253479, + "num_tokens": 509788256.0, + "step": 13358 + }, + { + "epoch": 1.6994021116906246, + "grad_norm": 1.3629987239837646, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8932098150253296, + "num_tokens": 509830440.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "grad_norm": 1.603010654449463, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8869541883468628, + "num_tokens": 509870504.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "grad_norm": 1.5273748636245728, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8737355470657349, + "num_tokens": 509908330.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "grad_norm": 1.3863823413848877, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.884408712387085, + "num_tokens": 509950376.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "grad_norm": 1.5879777669906616, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8739635944366455, + "num_tokens": 509987526.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "grad_norm": 1.4503875970840454, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8790609836578369, + "num_tokens": 510026853.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "grad_norm": 1.6221811771392822, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8817278742790222, + "num_tokens": 510061455.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "grad_norm": 1.5113296508789062, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8697134256362915, + "num_tokens": 510100834.0, + "step": 13366 + }, + { + "epoch": 
1.7004197939193486, + "grad_norm": 1.591617465019226, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8724135756492615, + "num_tokens": 510140686.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "grad_norm": 1.5537227392196655, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.876388430595398, + "num_tokens": 510176481.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "grad_norm": 1.4815646409988403, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8860805034637451, + "num_tokens": 510210644.0, + "step": 13369 + }, + { + "epoch": 1.7008014247551202, + "grad_norm": 1.5471930503845215, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8660202026367188, + "num_tokens": 510248314.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "grad_norm": 1.5162279605865479, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8799421787261963, + "num_tokens": 510283249.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "grad_norm": 1.622326135635376, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8682519197463989, + "num_tokens": 510319002.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "grad_norm": 1.4919480085372925, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.875029981136322, + "num_tokens": 510356582.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "grad_norm": 1.4176307916641235, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8658764362335205, + "num_tokens": 510400253.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "grad_norm": 1.3647140264511108, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8806904554367065, + "num_tokens": 510445312.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "grad_norm": 1.465112328529358, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 
0.8850644826889038, + "num_tokens": 510483215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "grad_norm": 1.4504761695861816, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8727788329124451, + "num_tokens": 510524441.0, + "step": 13377 + }, + { + "epoch": 1.7018191069838444, + "grad_norm": 1.5505315065383911, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8693039417266846, + "num_tokens": 510564029.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "grad_norm": 1.5182830095291138, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8705797791481018, + "num_tokens": 510607686.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "grad_norm": 1.5138747692108154, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8882091641426086, + "num_tokens": 510639752.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "grad_norm": 1.5719590187072754, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.880517840385437, + "num_tokens": 510674486.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "grad_norm": 1.3917039632797241, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8869360685348511, + "num_tokens": 510712036.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "grad_norm": 1.3861302137374878, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.896589457988739, + "num_tokens": 510752366.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "grad_norm": 1.3962554931640625, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8774898052215576, + "num_tokens": 510794032.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "grad_norm": 1.5496665239334106, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8830029964447021, + "num_tokens": 510828715.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "grad_norm": 
1.6638679504394531, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8721324801445007, + "num_tokens": 510862611.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "grad_norm": 1.5726884603500366, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8727430105209351, + "num_tokens": 510902668.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "grad_norm": 1.5479803085327148, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8812559843063354, + "num_tokens": 510942611.0, + "step": 13388 + }, + { + "epoch": 1.7032184200483398, + "grad_norm": 1.4233293533325195, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8937574625015259, + "num_tokens": 510980412.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "grad_norm": 1.5079293251037598, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8725571632385254, + "num_tokens": 511016854.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "grad_norm": 1.514196515083313, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8818788528442383, + "num_tokens": 511054378.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "grad_norm": 1.4259510040283203, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8751161098480225, + "num_tokens": 511095532.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "grad_norm": 1.4579782485961914, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.876912534236908, + "num_tokens": 511133611.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "grad_norm": 1.5152342319488525, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8821909427642822, + "num_tokens": 511170024.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "grad_norm": 1.6280393600463867, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8670370578765869, + 
"num_tokens": 511205219.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "grad_norm": 1.385167121887207, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8886337280273438, + "num_tokens": 511248575.0, + "step": 13396 + }, + { + "epoch": 1.704236102277064, + "grad_norm": 1.5762678384780884, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.870897650718689, + "num_tokens": 511283715.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "grad_norm": 1.6158820390701294, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8825897574424744, + "num_tokens": 511317917.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "grad_norm": 1.4323190450668335, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8827661275863647, + "num_tokens": 511358312.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "grad_norm": 1.4652092456817627, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8871618509292603, + "num_tokens": 511397066.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "grad_norm": 1.6516900062561035, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8686700463294983, + "num_tokens": 511431319.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "grad_norm": 1.4386894702911377, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8937402963638306, + "num_tokens": 511465831.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "grad_norm": 1.5898035764694214, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8716129064559937, + "num_tokens": 511499832.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "grad_norm": 1.551525592803955, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.889305591583252, + "num_tokens": 511538193.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "grad_norm": 1.4409385919570923, + 
"learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.864957332611084, + "num_tokens": 511578397.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "grad_norm": 1.374772548675537, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8938747048377991, + "num_tokens": 511618515.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "grad_norm": 1.802377700805664, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8711041808128357, + "num_tokens": 511659992.0, + "step": 13407 + }, + { + "epoch": 1.7056354153415596, + "grad_norm": 1.5170139074325562, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8666573762893677, + "num_tokens": 511700733.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "grad_norm": 1.7281904220581055, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8551440238952637, + "num_tokens": 511733557.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "grad_norm": 1.5236221551895142, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8842238187789917, + "num_tokens": 511769511.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "grad_norm": 1.5330781936645508, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8705970048904419, + "num_tokens": 511805340.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "grad_norm": 1.5678277015686035, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8786377906799316, + "num_tokens": 511839760.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "grad_norm": 1.5557435750961304, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8668541312217712, + "num_tokens": 511879400.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "grad_norm": 1.602333664894104, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8842289447784424, + "num_tokens": 511914857.0, + "step": 
13414 + }, + { + "epoch": 1.706525887291693, + "grad_norm": 1.5903480052947998, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8850863575935364, + "num_tokens": 511948521.0, + "step": 13415 + }, + { + "epoch": 1.7066530975702836, + "grad_norm": 1.574419379234314, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8958668112754822, + "num_tokens": 511979390.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "grad_norm": 1.5784465074539185, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8756003379821777, + "num_tokens": 512014973.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "grad_norm": 1.5790659189224243, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.862436056137085, + "num_tokens": 512051482.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "grad_norm": 1.651924729347229, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8768083453178406, + "num_tokens": 512081519.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "grad_norm": 1.4748209714889526, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8747061491012573, + "num_tokens": 512121182.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "grad_norm": 1.4023405313491821, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8786624670028687, + "num_tokens": 512161841.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "grad_norm": 1.5129752159118652, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.878138542175293, + "num_tokens": 512197618.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "grad_norm": 1.4826440811157227, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.885918378829956, + "num_tokens": 512233357.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "grad_norm": 1.521714210510254, + "learning_rate": 1e-06, + "loss": 0.3667, 
+ "mean_token_accuracy": 0.877659797668457, + "num_tokens": 512269052.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "grad_norm": 1.4519199132919312, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.875641942024231, + "num_tokens": 512306789.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "grad_norm": 1.4864592552185059, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8855941891670227, + "num_tokens": 512341877.0, + "step": 13426 + }, + { + "epoch": 1.7080524106347794, + "grad_norm": 1.5888150930404663, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8726959824562073, + "num_tokens": 512382279.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "grad_norm": 1.486849069595337, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8825217485427856, + "num_tokens": 512421006.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "grad_norm": 1.4891010522842407, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8882700204849243, + "num_tokens": 512459894.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "grad_norm": 1.673067569732666, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8746228814125061, + "num_tokens": 512489747.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "grad_norm": 1.5754369497299194, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8933528661727905, + "num_tokens": 512521639.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "grad_norm": 1.427117109298706, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8828834295272827, + "num_tokens": 512566068.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "grad_norm": 1.4928641319274902, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8771058320999146, + "num_tokens": 512603729.0, + "step": 13433 + }, + { + "epoch": 
1.708942882584913, + "grad_norm": 1.5453591346740723, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8752011060714722, + "num_tokens": 512643633.0, + "step": 13434 + }, + { + "epoch": 1.7090700928635034, + "grad_norm": 1.4081459045410156, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8908267617225647, + "num_tokens": 512682255.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "grad_norm": 1.4274381399154663, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8688609004020691, + "num_tokens": 512725812.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "grad_norm": 1.5458693504333496, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8738335371017456, + "num_tokens": 512762557.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "grad_norm": 1.348676085472107, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8781926035881042, + "num_tokens": 512805794.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "grad_norm": 1.4405300617218018, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8916433453559875, + "num_tokens": 512843155.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "grad_norm": 1.5071280002593994, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.870686411857605, + "num_tokens": 512882622.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "grad_norm": 1.4348161220550537, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8781796097755432, + "num_tokens": 512922098.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "grad_norm": 1.501875400543213, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.869866669178009, + "num_tokens": 512961372.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "grad_norm": 1.471883773803711, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 
0.8799266219139099, + "num_tokens": 513003108.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "grad_norm": 1.6904317140579224, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8659644722938538, + "num_tokens": 513037916.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "grad_norm": 1.6180046796798706, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8963370323181152, + "num_tokens": 513068105.0, + "step": 13445 + }, + { + "epoch": 1.710469405927999, + "grad_norm": 1.6087523698806763, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8708452582359314, + "num_tokens": 513103212.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "grad_norm": 1.5075736045837402, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8845925331115723, + "num_tokens": 513144682.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "grad_norm": 1.4271105527877808, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8988698720932007, + "num_tokens": 513179557.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "grad_norm": 1.4734413623809814, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8847503662109375, + "num_tokens": 513221124.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "grad_norm": 1.3715112209320068, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8829046487808228, + "num_tokens": 513269673.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "grad_norm": 1.5768307447433472, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8851636648178101, + "num_tokens": 513307847.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "grad_norm": 1.542290210723877, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8662421703338623, + "num_tokens": 513344729.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "grad_norm": 
1.3325614929199219, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8929304480552673, + "num_tokens": 513385211.0, + "step": 13453 + }, + { + "epoch": 1.711487088156723, + "grad_norm": 1.6343622207641602, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8810173273086548, + "num_tokens": 513419472.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "grad_norm": 1.4113612174987793, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8763895034790039, + "num_tokens": 513459672.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "grad_norm": 1.5676169395446777, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8824518918991089, + "num_tokens": 513495344.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "grad_norm": 1.5105515718460083, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8789809942245483, + "num_tokens": 513535034.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "grad_norm": 1.556785225868225, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8706245422363281, + "num_tokens": 513572037.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "grad_norm": 1.4535642862319946, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8601272106170654, + "num_tokens": 513615150.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "grad_norm": 1.382373332977295, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8794347047805786, + "num_tokens": 513659656.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "grad_norm": 1.491914987564087, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8743557929992676, + "num_tokens": 513695916.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "grad_norm": 1.5157848596572876, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8630141615867615, + "num_tokens": 
513735425.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "grad_norm": 1.4352229833602905, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8727165460586548, + "num_tokens": 513779118.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "grad_norm": 1.6004416942596436, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8846381306648254, + "num_tokens": 513811184.0, + "step": 13464 + }, + { + "epoch": 1.7128864012212186, + "grad_norm": 1.4562727212905884, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8855972290039062, + "num_tokens": 513847022.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "grad_norm": 1.58524489402771, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8638427257537842, + "num_tokens": 513884595.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "grad_norm": 1.5931167602539062, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8847349882125854, + "num_tokens": 513920209.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "grad_norm": 1.5132449865341187, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8844113349914551, + "num_tokens": 513956122.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "grad_norm": 1.4679489135742188, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8733654022216797, + "num_tokens": 513996755.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "grad_norm": 1.4962565898895264, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8745114207267761, + "num_tokens": 514036615.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "grad_norm": 1.5085902214050293, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8813179135322571, + "num_tokens": 514075662.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "grad_norm": 1.5349522829055786, + "learning_rate": 
1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8926925659179688, + "num_tokens": 514112908.0, + "step": 13472 + }, + { + "epoch": 1.7139040834499428, + "grad_norm": 1.5076907873153687, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8678061962127686, + "num_tokens": 514152700.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "grad_norm": 1.4184871912002563, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8921054005622864, + "num_tokens": 514188065.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "grad_norm": 1.6174933910369873, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8797457218170166, + "num_tokens": 514222969.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.5556647777557373, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.874354362487793, + "num_tokens": 514260315.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "grad_norm": 1.5197643041610718, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8834699392318726, + "num_tokens": 514295013.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "grad_norm": 1.429936408996582, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8858689665794373, + "num_tokens": 514331892.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "grad_norm": 1.5290035009384155, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8797428607940674, + "num_tokens": 514367875.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "grad_norm": 1.5313222408294678, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8795220851898193, + "num_tokens": 514403878.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "grad_norm": 1.4578794240951538, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8589210510253906, + "num_tokens": 514448173.0, + "step": 13481 + }, + 
{ + "epoch": 1.7150489759572574, + "grad_norm": 1.5036125183105469, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8693881034851074, + "num_tokens": 514488005.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "grad_norm": 1.5269008874893188, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8739330768585205, + "num_tokens": 514526910.0, + "step": 13483 + }, + { + "epoch": 1.7153033965144384, + "grad_norm": 1.5188522338867188, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8766413927078247, + "num_tokens": 514569026.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "grad_norm": 1.4717650413513184, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8736420273780823, + "num_tokens": 514610300.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "grad_norm": 1.7041219472885132, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8838603496551514, + "num_tokens": 514645280.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "grad_norm": 1.5171810388565063, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8841662406921387, + "num_tokens": 514679882.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "grad_norm": 1.4874908924102783, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8794018030166626, + "num_tokens": 514718641.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "grad_norm": 1.468178391456604, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8918971419334412, + "num_tokens": 514755115.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "grad_norm": 1.542375922203064, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8848393559455872, + "num_tokens": 514787966.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "grad_norm": 1.63433039188385, + "learning_rate": 1e-06, + "loss": 0.3528, + 
"mean_token_accuracy": 0.8776212334632874, + "num_tokens": 514818679.0, + "step": 13491 + }, + { + "epoch": 1.7163210787431624, + "grad_norm": 1.5998613834381104, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8492619395256042, + "num_tokens": 514857454.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "grad_norm": 1.4471020698547363, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8849664330482483, + "num_tokens": 514898312.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "grad_norm": 1.4812426567077637, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8715994358062744, + "num_tokens": 514935151.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "grad_norm": 1.4399791955947876, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8789781332015991, + "num_tokens": 514975352.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "grad_norm": 1.5870846509933472, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8798495531082153, + "num_tokens": 515007513.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "grad_norm": 1.4683619737625122, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8867806792259216, + "num_tokens": 515044440.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "grad_norm": 1.39833664894104, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8768121600151062, + "num_tokens": 515083755.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "grad_norm": 1.3986443281173706, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8822322487831116, + "num_tokens": 515122301.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "grad_norm": 1.5037331581115723, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8664466142654419, + "num_tokens": 515161373.0, + "step": 13500 + }, + { + "epoch": 
1.7174659712504772, + "grad_norm": 1.5008044242858887, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8766824007034302, + "num_tokens": 515196345.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "grad_norm": 1.5106792449951172, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8971741199493408, + "num_tokens": 515231282.0, + "step": 13502 + }, + { + "epoch": 1.717720391807658, + "grad_norm": 1.4502462148666382, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8793560862541199, + "num_tokens": 515268733.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "grad_norm": 1.4736568927764893, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8775029182434082, + "num_tokens": 515309100.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "grad_norm": 1.6671589612960815, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8828608393669128, + "num_tokens": 515342018.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "grad_norm": 1.589486837387085, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.897628664970398, + "num_tokens": 515373825.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "grad_norm": 1.5239826440811157, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8790346384048462, + "num_tokens": 515410986.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "grad_norm": 1.624321699142456, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8768212795257568, + "num_tokens": 515449730.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "grad_norm": 1.7889994382858276, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8530866503715515, + "num_tokens": 515487654.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "grad_norm": 1.5602726936340332, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 
0.8699218034744263, + "num_tokens": 515524550.0, + "step": 13510 + }, + { + "epoch": 1.718738074036382, + "grad_norm": 1.5172269344329834, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8818930983543396, + "num_tokens": 515563291.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "grad_norm": 1.7037773132324219, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8801335096359253, + "num_tokens": 515599583.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "grad_norm": 1.6679627895355225, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8605629801750183, + "num_tokens": 515637328.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "grad_norm": 1.652516484260559, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8574365973472595, + "num_tokens": 515675549.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "grad_norm": 1.4501090049743652, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8639296293258667, + "num_tokens": 515719310.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "grad_norm": 1.604868769645691, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.872833788394928, + "num_tokens": 515752434.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "grad_norm": 1.4540233612060547, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8917180895805359, + "num_tokens": 515787710.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "grad_norm": 1.520134449005127, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8658843040466309, + "num_tokens": 515830285.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "grad_norm": 1.609303593635559, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8842439651489258, + "num_tokens": 515862687.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "grad_norm": 
1.3475502729415894, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8806279301643372, + "num_tokens": 515908468.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "grad_norm": 1.430229902267456, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8821673393249512, + "num_tokens": 515944765.0, + "step": 13521 + }, + { + "epoch": 1.7201373871008778, + "grad_norm": 1.329228162765503, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8806807994842529, + "num_tokens": 515988005.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "grad_norm": 1.5254733562469482, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8758950233459473, + "num_tokens": 516026198.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "grad_norm": 1.5086023807525635, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8632941842079163, + "num_tokens": 516062340.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "grad_norm": 1.4927924871444702, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8874130845069885, + "num_tokens": 516096868.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "grad_norm": 1.5718331336975098, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8827283382415771, + "num_tokens": 516131238.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "grad_norm": 1.5165263414382935, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8736752271652222, + "num_tokens": 516168918.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "grad_norm": 1.5861835479736328, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.886189877986908, + "num_tokens": 516200385.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "grad_norm": 1.9565067291259766, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8589800596237183, + "num_tokens": 
516227887.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + "grad_norm": 1.3971420526504517, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8906100988388062, + "num_tokens": 516270148.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "grad_norm": 1.59781014919281, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8738299608230591, + "num_tokens": 516304751.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "grad_norm": 1.4072656631469727, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8907189965248108, + "num_tokens": 516340745.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "grad_norm": 1.4690096378326416, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8818093538284302, + "num_tokens": 516378022.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "grad_norm": 1.5566959381103516, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.877116322517395, + "num_tokens": 516414669.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "grad_norm": 1.691780686378479, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8784874677658081, + "num_tokens": 516446595.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "grad_norm": 1.4808564186096191, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8961015343666077, + "num_tokens": 516481160.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "grad_norm": 1.5260252952575684, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8625373244285583, + "num_tokens": 516521000.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "grad_norm": 1.3810116052627563, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.885398268699646, + "num_tokens": 516558991.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "grad_norm": 1.5257775783538818, + "learning_rate": 
1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8829689621925354, + "num_tokens": 516595352.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "grad_norm": 1.463999629020691, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8793711066246033, + "num_tokens": 516635100.0, + "step": 13540 + }, + { + "epoch": 1.7225543823940974, + "grad_norm": 1.5512454509735107, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8901939988136292, + "num_tokens": 516668474.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "grad_norm": 1.4448456764221191, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.886711835861206, + "num_tokens": 516708909.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "grad_norm": 1.4955946207046509, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8801764249801636, + "num_tokens": 516746760.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "grad_norm": 1.3393701314926147, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8937721848487854, + "num_tokens": 516788997.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "grad_norm": 1.4306416511535645, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8713324666023254, + "num_tokens": 516830213.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "grad_norm": 1.3290960788726807, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8727534413337708, + "num_tokens": 516875539.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "grad_norm": 1.6071677207946777, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8851059675216675, + "num_tokens": 516905794.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "grad_norm": 1.5279630422592163, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8675613403320312, + "num_tokens": 516945835.0, + "step": 13548 + }, + { 
+ "epoch": 1.7235720646228216, + "grad_norm": 1.4977831840515137, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.871656596660614, + "num_tokens": 516987746.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "grad_norm": 1.411197304725647, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8609097003936768, + "num_tokens": 517034185.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "grad_norm": 1.5433505773544312, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.880695104598999, + "num_tokens": 517071756.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "grad_norm": 1.6090710163116455, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8706142902374268, + "num_tokens": 517105872.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "grad_norm": 1.4267396926879883, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8682441711425781, + "num_tokens": 517146266.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "grad_norm": 1.4727998971939087, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8756868243217468, + "num_tokens": 517185954.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "grad_norm": 1.6240688562393188, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.870141327381134, + "num_tokens": 517218455.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "grad_norm": 1.4440884590148926, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8664624094963074, + "num_tokens": 517257590.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "grad_norm": 1.3782247304916382, + "learning_rate": 1e-06, + "loss": 0.2768, + "mean_token_accuracy": 0.900538980960846, + "num_tokens": 517297361.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "grad_norm": 1.6802165508270264, + "learning_rate": 1e-06, + "loss": 0.3645, + 
"mean_token_accuracy": 0.8736079931259155, + "num_tokens": 517328234.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "grad_norm": 1.4399889707565308, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8685101270675659, + "num_tokens": 517370650.0, + "step": 13559 + }, + { + "epoch": 1.724971377687317, + "grad_norm": 1.5008803606033325, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8785423040390015, + "num_tokens": 517406554.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "grad_norm": 1.4619979858398438, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8750458359718323, + "num_tokens": 517446586.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "grad_norm": 1.5206135511398315, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8873506784439087, + "num_tokens": 517481382.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "grad_norm": 1.3610378503799438, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.89577317237854, + "num_tokens": 517521440.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "grad_norm": 1.5755665302276611, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8893893957138062, + "num_tokens": 517554717.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "grad_norm": 1.37718665599823, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8829559683799744, + "num_tokens": 517596301.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "grad_norm": 1.492243766784668, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8840634226799011, + "num_tokens": 517633441.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "grad_norm": 1.5951288938522339, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8750907778739929, + "num_tokens": 517667523.0, + "step": 13567 + }, + { + "epoch": 
1.7259890599160412, + "grad_norm": 1.5937739610671997, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8630697727203369, + "num_tokens": 517702607.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "grad_norm": 1.4469411373138428, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.884319543838501, + "num_tokens": 517741503.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "grad_norm": 1.4456379413604736, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8698095083236694, + "num_tokens": 517783762.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "grad_norm": 1.5087031126022339, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8835511803627014, + "num_tokens": 517820822.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "grad_norm": 1.4520204067230225, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8774784207344055, + "num_tokens": 517861840.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "grad_norm": 1.5371172428131104, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8810509443283081, + "num_tokens": 517895292.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "grad_norm": 1.4770845174789429, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8863159418106079, + "num_tokens": 517935857.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "grad_norm": 1.4561407566070557, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8890639543533325, + "num_tokens": 517972170.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "grad_norm": 1.5134941339492798, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8851343393325806, + "num_tokens": 518006577.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "grad_norm": 1.4544168710708618, + "learning_rate": 1e-06, + "loss": 0.3344, + 
"mean_token_accuracy": 0.882758617401123, + "num_tokens": 518044834.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "grad_norm": 1.6292223930358887, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8814988136291504, + "num_tokens": 518076790.0, + "step": 13578 + }, + { + "epoch": 1.7273883729805368, + "grad_norm": 1.4367671012878418, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8788861036300659, + "num_tokens": 518117558.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "grad_norm": 1.4060684442520142, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8825464248657227, + "num_tokens": 518162247.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "grad_norm": 1.5890657901763916, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8753934502601624, + "num_tokens": 518197004.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "grad_norm": 1.4753810167312622, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.886072039604187, + "num_tokens": 518233751.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "grad_norm": 1.4018598794937134, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8783091306686401, + "num_tokens": 518277836.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "grad_norm": 1.6313589811325073, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8793876767158508, + "num_tokens": 518311779.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "grad_norm": 1.5864781141281128, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.880991518497467, + "num_tokens": 518350557.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "grad_norm": 1.5544378757476807, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.879069983959198, + "num_tokens": 518385014.0, + "step": 13586 + }, + { + "epoch": 
1.7284060552092608, + "grad_norm": 1.466177225112915, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8648484945297241, + "num_tokens": 518425513.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "grad_norm": 1.5079867839813232, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8846825361251831, + "num_tokens": 518460376.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "grad_norm": 1.6330578327178955, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8816710114479065, + "num_tokens": 518498179.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "grad_norm": 1.532210350036621, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.876191258430481, + "num_tokens": 518539339.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "grad_norm": 1.5491200685501099, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8825072050094604, + "num_tokens": 518572258.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "grad_norm": 1.4451245069503784, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8745241165161133, + "num_tokens": 518617221.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "grad_norm": 1.3928524255752563, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8852848410606384, + "num_tokens": 518659125.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "grad_norm": 1.6110440492630005, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8728744983673096, + "num_tokens": 518695814.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "grad_norm": 1.6387734413146973, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8759201765060425, + "num_tokens": 518729263.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "grad_norm": 1.487431287765503, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 
0.8833588361740112, + "num_tokens": 518766190.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "grad_norm": 1.422561526298523, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8826747536659241, + "num_tokens": 518806149.0, + "step": 13597 + }, + { + "epoch": 1.7298053682737566, + "grad_norm": 1.4351214170455933, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8782533407211304, + "num_tokens": 518846920.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "grad_norm": 1.4881418943405151, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8811817169189453, + "num_tokens": 518885431.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "grad_norm": 1.4471712112426758, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8779127597808838, + "num_tokens": 518920713.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "grad_norm": 1.4227817058563232, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8852056264877319, + "num_tokens": 518959940.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "grad_norm": 1.5380579233169556, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.884769082069397, + "num_tokens": 518999381.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "grad_norm": 1.5106624364852905, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8854129910469055, + "num_tokens": 519037723.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "grad_norm": 1.4417086839675903, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8875489234924316, + "num_tokens": 519076328.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "grad_norm": 1.3649221658706665, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8811897039413452, + "num_tokens": 519118200.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "grad_norm": 
1.3909153938293457, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8727936148643494, + "num_tokens": 519162078.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "grad_norm": 1.5209068059921265, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8751344680786133, + "num_tokens": 519199964.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "grad_norm": 1.408454418182373, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8895412683486938, + "num_tokens": 519240419.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "grad_norm": 1.483833909034729, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8787424564361572, + "num_tokens": 519279681.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "grad_norm": 1.4303061962127686, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8767589926719666, + "num_tokens": 519321780.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "grad_norm": 1.5592262744903564, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8772567510604858, + "num_tokens": 519359983.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "grad_norm": 1.4070868492126465, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8793869018554688, + "num_tokens": 519399741.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "grad_norm": 1.5848212242126465, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8867945075035095, + "num_tokens": 519433708.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "grad_norm": 1.519267201423645, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8748143315315247, + "num_tokens": 519473827.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "grad_norm": 1.5182827711105347, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8862665891647339, + "num_tokens": 
519508289.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "grad_norm": 1.5574939250946045, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8722503185272217, + "num_tokens": 519548259.0, + "step": 13616 + }, + { + "epoch": 1.7322223635669762, + "grad_norm": 1.4920955896377563, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.888032853603363, + "num_tokens": 519584413.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "grad_norm": 1.5031278133392334, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8725715279579163, + "num_tokens": 519628161.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "grad_norm": 1.526503086090088, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8729729652404785, + "num_tokens": 519667148.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "grad_norm": 1.6156293153762817, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8710132837295532, + "num_tokens": 519705308.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "grad_norm": 1.4224117994308472, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8761790990829468, + "num_tokens": 519748244.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "grad_norm": 1.6269707679748535, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8695545196533203, + "num_tokens": 519783072.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "grad_norm": 1.4004340171813965, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8783086538314819, + "num_tokens": 519825050.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "grad_norm": 1.506964921951294, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.870647132396698, + "num_tokens": 519860687.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "grad_norm": 1.460614800453186, + "learning_rate": 
1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8880519866943359, + "num_tokens": 519894019.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "grad_norm": 1.5082122087478638, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8857948780059814, + "num_tokens": 519931040.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "grad_norm": 1.4569092988967896, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8727443218231201, + "num_tokens": 519971311.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "grad_norm": 1.568625569343567, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8808727264404297, + "num_tokens": 520007185.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "grad_norm": 1.5571271181106567, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8631699085235596, + "num_tokens": 520047889.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "grad_norm": 1.4378294944763184, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8877048492431641, + "num_tokens": 520085165.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "grad_norm": 1.461743950843811, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8737196922302246, + "num_tokens": 520125280.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "grad_norm": 1.5247725248336792, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8551839590072632, + "num_tokens": 520161658.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "grad_norm": 1.6452672481536865, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8666783571243286, + "num_tokens": 520197130.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "grad_norm": 1.5251407623291016, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8703447580337524, + "num_tokens": 520236190.0, + "step": 13634 + }, 
+ { + "epoch": 1.7345121485816053, + "grad_norm": 1.4419136047363281, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8853721618652344, + "num_tokens": 520273580.0, + "step": 13635 + }, + { + "epoch": 1.7346393588601958, + "grad_norm": 1.4567780494689941, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8778706789016724, + "num_tokens": 520316424.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "grad_norm": 1.6864145994186401, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8759986162185669, + "num_tokens": 520352870.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "grad_norm": 1.394201397895813, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.8966016173362732, + "num_tokens": 520387724.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "grad_norm": 1.380821943283081, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8764812350273132, + "num_tokens": 520432885.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "grad_norm": 1.6118619441986084, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8639391660690308, + "num_tokens": 520468752.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "grad_norm": 1.5735198259353638, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8743772506713867, + "num_tokens": 520504687.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "grad_norm": 1.6295223236083984, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8736540079116821, + "num_tokens": 520538257.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "grad_norm": 1.5529696941375732, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8749032020568848, + "num_tokens": 520576416.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "grad_norm": 1.4774599075317383, + "learning_rate": 1e-06, + "loss": 0.3198, + 
"mean_token_accuracy": 0.8873180150985718, + "num_tokens": 520612554.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "grad_norm": 1.6185702085494995, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8725441694259644, + "num_tokens": 520647797.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "grad_norm": 1.491767168045044, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.882831335067749, + "num_tokens": 520686193.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "grad_norm": 1.5009981393814087, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8661601543426514, + "num_tokens": 520723997.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "grad_norm": 1.4785605669021606, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8946994543075562, + "num_tokens": 520761294.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "grad_norm": 1.4826897382736206, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8838833570480347, + "num_tokens": 520800488.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "grad_norm": 1.4446195363998413, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8883838653564453, + "num_tokens": 520838349.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "grad_norm": 1.48544180393219, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.885743260383606, + "num_tokens": 520875181.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "grad_norm": 1.4861725568771362, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.865110456943512, + "num_tokens": 520914715.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "grad_norm": 1.5301614999771118, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8736672401428223, + "num_tokens": 520954742.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + 
"grad_norm": 1.527799367904663, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8858175277709961, + "num_tokens": 520995327.0, + "step": 13654 + }, + { + "epoch": 1.7370563541534156, + "grad_norm": 1.5271199941635132, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8841313719749451, + "num_tokens": 521031919.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "grad_norm": 1.631906270980835, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8719385862350464, + "num_tokens": 521064465.0, + "step": 13656 + }, + { + "epoch": 1.7373107747105965, + "grad_norm": 1.5487421751022339, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8787816762924194, + "num_tokens": 521101099.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "grad_norm": 1.5027885437011719, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8714737892150879, + "num_tokens": 521139987.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "grad_norm": 1.4664896726608276, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8711214065551758, + "num_tokens": 521181692.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "grad_norm": 1.3882098197937012, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8842647075653076, + "num_tokens": 521225639.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "grad_norm": 1.3983708620071411, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8826782703399658, + "num_tokens": 521264733.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "grad_norm": 1.4781264066696167, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8856837749481201, + "num_tokens": 521299234.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "grad_norm": 1.4940520524978638, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8805563449859619, + 
"num_tokens": 521337987.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "grad_norm": 1.555766224861145, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.883792519569397, + "num_tokens": 521377255.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "grad_norm": 1.4995577335357666, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8683116436004639, + "num_tokens": 521417767.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "grad_norm": 1.6211977005004883, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.878294050693512, + "num_tokens": 521453424.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "grad_norm": 1.4000303745269775, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8790227174758911, + "num_tokens": 521492091.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "grad_norm": 1.4328532218933105, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8868542313575745, + "num_tokens": 521531148.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "grad_norm": 1.3440330028533936, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8760305643081665, + "num_tokens": 521578037.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "grad_norm": 1.3184731006622314, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.875838041305542, + "num_tokens": 521623605.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "grad_norm": 1.4479504823684692, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8652452826499939, + "num_tokens": 521671023.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "grad_norm": 1.504494309425354, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.874048113822937, + "num_tokens": 521711465.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "grad_norm": 1.389007568359375, + 
"learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8898400068283081, + "num_tokens": 521751675.0, + "step": 13673 + }, + { + "epoch": 1.7394733494466355, + "grad_norm": 1.4340606927871704, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8613903522491455, + "num_tokens": 521798442.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "grad_norm": 1.6438804864883423, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.883650541305542, + "num_tokens": 521829073.0, + "step": 13675 + }, + { + "epoch": 1.7397277700038163, + "grad_norm": 1.4458519220352173, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.871812105178833, + "num_tokens": 521869254.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "grad_norm": 1.537221074104309, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8795067071914673, + "num_tokens": 521902220.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "grad_norm": 1.4762834310531616, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8774487972259521, + "num_tokens": 521940269.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "grad_norm": 1.5060465335845947, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8687456250190735, + "num_tokens": 521979874.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "grad_norm": 1.436331868171692, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8841920495033264, + "num_tokens": 522019309.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "grad_norm": 1.5422011613845825, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8696532249450684, + "num_tokens": 522058483.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "grad_norm": 1.347184181213379, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8850671052932739, + "num_tokens": 522102603.0, + 
"step": 13682 + }, + { + "epoch": 1.7406182419539498, + "grad_norm": 1.5551481246948242, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8806406855583191, + "num_tokens": 522139057.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "grad_norm": 1.5065182447433472, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8671869039535522, + "num_tokens": 522181365.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "grad_norm": 1.6035665273666382, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.867741584777832, + "num_tokens": 522216599.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "grad_norm": 1.5248453617095947, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8696117401123047, + "num_tokens": 522256229.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "grad_norm": 1.4937801361083984, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8739324808120728, + "num_tokens": 522294459.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "grad_norm": 1.5127506256103516, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8853646516799927, + "num_tokens": 522328831.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "grad_norm": 1.4554282426834106, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.869308590888977, + "num_tokens": 522372597.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "grad_norm": 1.533390998840332, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8761523962020874, + "num_tokens": 522408968.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "grad_norm": 1.4786186218261719, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8797976970672607, + "num_tokens": 522449483.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "grad_norm": 1.4332197904586792, + "learning_rate": 1e-06, + 
"loss": 0.325, + "mean_token_accuracy": 0.8872483968734741, + "num_tokens": 522490587.0, + "step": 13692 + }, + { + "epoch": 1.741890344739855, + "grad_norm": 1.3673069477081299, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8767595887184143, + "num_tokens": 522533582.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "grad_norm": 1.5146554708480835, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8764711618423462, + "num_tokens": 522570471.0, + "step": 13694 + }, + { + "epoch": 1.742144765297036, + "grad_norm": 1.4417543411254883, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8916856050491333, + "num_tokens": 522606486.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "grad_norm": 1.4750547409057617, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8622099161148071, + "num_tokens": 522649114.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "grad_norm": 1.5833661556243896, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8830084800720215, + "num_tokens": 522682027.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "grad_norm": 1.4108132123947144, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.884972333908081, + "num_tokens": 522722201.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "grad_norm": 1.496779203414917, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.878764808177948, + "num_tokens": 522761652.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "grad_norm": 1.5055662393569946, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.874234676361084, + "num_tokens": 522799688.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "grad_norm": 1.6123721599578857, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8796846270561218, + "num_tokens": 522832880.0, + "step": 13701 + }, + { + "epoch": 
1.7430352372471696, + "grad_norm": 1.502183198928833, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.867679238319397, + "num_tokens": 522872078.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "grad_norm": 1.4042913913726807, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8854901194572449, + "num_tokens": 522911716.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "grad_norm": 1.4886442422866821, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8797536492347717, + "num_tokens": 522951672.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "grad_norm": 1.5222980976104736, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8787294030189514, + "num_tokens": 522986790.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "grad_norm": 1.6183130741119385, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8766792416572571, + "num_tokens": 523018407.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "grad_norm": 1.5395995378494263, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8736846446990967, + "num_tokens": 523054906.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "grad_norm": 1.4827494621276855, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8657968640327454, + "num_tokens": 523095880.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "grad_norm": 1.682209849357605, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8723626136779785, + "num_tokens": 523135344.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "grad_norm": 1.4160799980163574, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8777933716773987, + "num_tokens": 523175847.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "grad_norm": 1.3911068439483643, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 
0.8749106526374817, + "num_tokens": 523218877.0, + "step": 13711 + }, + { + "epoch": 1.7443073400330746, + "grad_norm": 1.5274544954299927, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8598247170448303, + "num_tokens": 523260592.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "grad_norm": 1.4790271520614624, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8954899311065674, + "num_tokens": 523295930.0, + "step": 13713 + }, + { + "epoch": 1.7445617605902557, + "grad_norm": 1.342107892036438, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8855124711990356, + "num_tokens": 523341552.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "grad_norm": 1.4505363702774048, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.871121346950531, + "num_tokens": 523382323.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "grad_norm": 1.420754075050354, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8858088254928589, + "num_tokens": 523423140.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "grad_norm": 1.5475903749465942, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8729363679885864, + "num_tokens": 523459763.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "grad_norm": 1.654992938041687, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8699980974197388, + "num_tokens": 523492629.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "grad_norm": 1.4476914405822754, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8684027791023254, + "num_tokens": 523536699.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "grad_norm": 1.5811773538589478, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8654002547264099, + "num_tokens": 523574417.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "grad_norm": 
1.4647597074508667, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8820425271987915, + "num_tokens": 523613365.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "grad_norm": 1.502906322479248, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8761005401611328, + "num_tokens": 523650758.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "grad_norm": 1.495679259300232, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8889757394790649, + "num_tokens": 523685546.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "grad_norm": 1.439206838607788, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8843287229537964, + "num_tokens": 523723566.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "grad_norm": 1.548691987991333, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8809457421302795, + "num_tokens": 523758133.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "grad_norm": 1.583440899848938, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.883967399597168, + "num_tokens": 523796165.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "grad_norm": 1.402866244316101, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.888027548789978, + "num_tokens": 523836765.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "grad_norm": 1.510252833366394, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8736625909805298, + "num_tokens": 523875067.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "grad_norm": 1.605776309967041, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8623420000076294, + "num_tokens": 523913995.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "grad_norm": 1.2956589460372925, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8907843232154846, + "num_tokens": 
523956969.0, + "step": 13730 + }, + { + "epoch": 1.7467243353262942, + "grad_norm": 1.4420933723449707, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8749109506607056, + "num_tokens": 523994450.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "grad_norm": 1.4853565692901611, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8763359189033508, + "num_tokens": 524036946.0, + "step": 13732 + }, + { + "epoch": 1.7469787558834753, + "grad_norm": 1.5522946119308472, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8792296648025513, + "num_tokens": 524071202.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "grad_norm": 1.408212423324585, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8730827569961548, + "num_tokens": 524113282.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "grad_norm": 1.4103703498840332, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8889399170875549, + "num_tokens": 524150661.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "grad_norm": 1.5848973989486694, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8901315331459045, + "num_tokens": 524181680.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "grad_norm": 1.357369065284729, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8770963549613953, + "num_tokens": 524225930.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "grad_norm": 1.4917614459991455, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8871665596961975, + "num_tokens": 524262463.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "grad_norm": 1.5795044898986816, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.876227855682373, + "num_tokens": 524300399.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "grad_norm": 1.4802674055099487, + "learning_rate": 
1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8740703463554382, + "num_tokens": 524337430.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "grad_norm": 1.4542046785354614, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8760623931884766, + "num_tokens": 524378341.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "grad_norm": 1.5286500453948975, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.878872275352478, + "num_tokens": 524415615.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "grad_norm": 1.4806569814682007, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8802465796470642, + "num_tokens": 524458335.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "grad_norm": 1.388765811920166, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8830627202987671, + "num_tokens": 524499564.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "grad_norm": 1.2957388162612915, + "learning_rate": 1e-06, + "loss": 0.2629, + "mean_token_accuracy": 0.9038521647453308, + "num_tokens": 524539259.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "grad_norm": 1.3082388639450073, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8945091962814331, + "num_tokens": 524582325.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "grad_norm": 1.3934624195098877, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8835077285766602, + "num_tokens": 524625875.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "grad_norm": 1.338134527206421, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8907337188720703, + "num_tokens": 524669216.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "grad_norm": 1.4918586015701294, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8711575865745544, + "num_tokens": 524709991.0, + "step": 13749 + }, + { + 
"epoch": 1.749141330619514, + "grad_norm": 1.631135106086731, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8663251996040344, + "num_tokens": 524747418.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "grad_norm": 1.3960156440734863, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8822327852249146, + "num_tokens": 524787298.0, + "step": 13751 + }, + { + "epoch": 1.749395751176695, + "grad_norm": 1.4687588214874268, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8741260170936584, + "num_tokens": 524826222.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "grad_norm": 1.3418407440185547, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8897567391395569, + "num_tokens": 524868067.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "grad_norm": 1.4974161386489868, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8603005409240723, + "num_tokens": 524910047.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "grad_norm": 1.4614959955215454, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8748009204864502, + "num_tokens": 524945762.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "grad_norm": 1.4739198684692383, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8737570643424988, + "num_tokens": 524985603.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "grad_norm": 1.3601289987564087, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8883615732192993, + "num_tokens": 525026261.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "grad_norm": 1.5253369808197021, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8591460585594177, + "num_tokens": 525069340.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "grad_norm": 1.4319552183151245, + "learning_rate": 1e-06, + "loss": 0.3167, + 
"mean_token_accuracy": 0.8858752250671387, + "num_tokens": 525105527.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "grad_norm": 1.5477839708328247, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.887491762638092, + "num_tokens": 525138070.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "grad_norm": 1.5808134078979492, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8754457831382751, + "num_tokens": 525174030.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "grad_norm": 1.5533759593963623, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8703522682189941, + "num_tokens": 525209758.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "grad_norm": 1.7307037115097046, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8769917488098145, + "num_tokens": 525237713.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "grad_norm": 1.4122984409332275, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8680717945098877, + "num_tokens": 525283869.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "grad_norm": 1.4476286172866821, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8770526647567749, + "num_tokens": 525327102.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "grad_norm": 1.6283643245697021, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8846476674079895, + "num_tokens": 525360885.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "grad_norm": 1.603866696357727, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8781114816665649, + "num_tokens": 525396985.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "grad_norm": 1.5215117931365967, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8776413202285767, + "num_tokens": 525432690.0, + "step": 13768 + }, + { + "epoch": 
1.7515583259127339, + "grad_norm": 1.4725912809371948, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8774235248565674, + "num_tokens": 525471733.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "grad_norm": 1.486043095588684, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8925408124923706, + "num_tokens": 525507739.0, + "step": 13770 + }, + { + "epoch": 1.751812746469915, + "grad_norm": 1.4719460010528564, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8884479403495789, + "num_tokens": 525546735.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "grad_norm": 1.3914791345596313, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8785712122917175, + "num_tokens": 525590740.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "grad_norm": 1.5468374490737915, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8846030235290527, + "num_tokens": 525624768.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "grad_norm": 1.4917283058166504, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8645033836364746, + "num_tokens": 525663368.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "grad_norm": 1.6101144552230835, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8809030055999756, + "num_tokens": 525700983.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "grad_norm": 1.455722689628601, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8999005556106567, + "num_tokens": 525738578.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "grad_norm": 1.568062424659729, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8702754378318787, + "num_tokens": 525780739.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "grad_norm": 1.5089632272720337, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 
0.8638217449188232, + "num_tokens": 525823219.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "grad_norm": 1.4752122163772583, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.881950855255127, + "num_tokens": 525861090.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "grad_norm": 1.5128402709960938, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8851844072341919, + "num_tokens": 525898531.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "grad_norm": 1.6830672025680542, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8757377862930298, + "num_tokens": 525929345.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "grad_norm": 1.52235746383667, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8775233030319214, + "num_tokens": 525967916.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "grad_norm": 1.5387533903121948, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8858152627944946, + "num_tokens": 526004350.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "grad_norm": 1.566904902458191, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.856468677520752, + "num_tokens": 526044768.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "grad_norm": 1.4656015634536743, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.874243438243866, + "num_tokens": 526086233.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "grad_norm": 1.487322449684143, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8800445795059204, + "num_tokens": 526123724.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "grad_norm": 1.4665695428848267, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8776484131813049, + "num_tokens": 526162062.0, + "step": 13787 + }, + { + "epoch": 1.7539753212059535, + "grad_norm": 
1.4249498844146729, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8745571374893188, + "num_tokens": 526201521.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "grad_norm": 1.5460444688796997, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8762983083724976, + "num_tokens": 526238371.0, + "step": 13789 + }, + { + "epoch": 1.7542297417631345, + "grad_norm": 2.0427405834198, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8802094459533691, + "num_tokens": 526277904.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "grad_norm": 1.395769715309143, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8788657188415527, + "num_tokens": 526316590.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "grad_norm": 1.4518781900405884, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8765033483505249, + "num_tokens": 526356669.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "grad_norm": 1.4934046268463135, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8879657983779907, + "num_tokens": 526393639.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "grad_norm": 1.4973191022872925, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8880517482757568, + "num_tokens": 526427790.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "grad_norm": 1.4504642486572266, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8774552345275879, + "num_tokens": 526467852.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "grad_norm": 1.4276249408721924, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8736428022384644, + "num_tokens": 526509107.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "grad_norm": 1.4852194786071777, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8695440888404846, + "num_tokens": 
526551253.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "grad_norm": 1.5229222774505615, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8724967241287231, + "num_tokens": 526589571.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "grad_norm": 1.5507550239562988, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8865008354187012, + "num_tokens": 526627670.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "grad_norm": 1.4414514303207397, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8769630193710327, + "num_tokens": 526666461.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "grad_norm": 1.3972177505493164, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8850435018539429, + "num_tokens": 526704544.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "grad_norm": 1.362502098083496, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8806743621826172, + "num_tokens": 526747128.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "grad_norm": 1.5474685430526733, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.887634813785553, + "num_tokens": 526782058.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "grad_norm": 1.536230444908142, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.861698567867279, + "num_tokens": 526824301.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "grad_norm": 1.4764233827590942, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8688532114028931, + "num_tokens": 526862568.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "grad_norm": 1.5472619533538818, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.865861177444458, + "num_tokens": 526903305.0, + "step": 13806 + }, + { + "epoch": 1.756392316499173, + "grad_norm": 1.4709339141845703, + "learning_rate": 1e-06, 
+ "loss": 0.3553, + "mean_token_accuracy": 0.8761488199234009, + "num_tokens": 526945604.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "grad_norm": 1.5389974117279053, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8610967993736267, + "num_tokens": 526983979.0, + "step": 13808 + }, + { + "epoch": 1.756646737056354, + "grad_norm": 1.5103228092193604, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8763927221298218, + "num_tokens": 527024707.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "grad_norm": 1.5314921140670776, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8766852021217346, + "num_tokens": 527061206.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "grad_norm": 1.47739839553833, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8798990249633789, + "num_tokens": 527100929.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "grad_norm": 1.4807151556015015, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8757007122039795, + "num_tokens": 527137806.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "grad_norm": 1.6369251012802124, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8616122007369995, + "num_tokens": 527174255.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "grad_norm": 1.4116981029510498, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8757427930831909, + "num_tokens": 527214919.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "grad_norm": 1.7075799703598022, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8705331087112427, + "num_tokens": 527246155.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "grad_norm": 1.540999412536621, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8768277168273926, + "num_tokens": 527283244.0, + "step": 13816 + }, + { + 
"epoch": 1.7576644192850783, + "grad_norm": 1.511156678199768, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.884787917137146, + "num_tokens": 527321066.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "grad_norm": 1.5494349002838135, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8723688721656799, + "num_tokens": 527356504.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "grad_norm": 1.4149179458618164, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8788495063781738, + "num_tokens": 527395390.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "grad_norm": 1.5743756294250488, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8910860419273376, + "num_tokens": 527427390.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "grad_norm": 1.4231743812561035, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8774634599685669, + "num_tokens": 527468731.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "grad_norm": 1.4558289051055908, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8938417434692383, + "num_tokens": 527505487.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "grad_norm": 1.3378034830093384, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8925515413284302, + "num_tokens": 527546748.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "grad_norm": 1.3646458387374878, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8886142373085022, + "num_tokens": 527589443.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "grad_norm": 1.5140146017074585, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8819657564163208, + "num_tokens": 527627841.0, + "step": 13825 + }, + { + "epoch": 1.7588093117923929, + "grad_norm": 1.5567386150360107, + "learning_rate": 1e-06, + "loss": 0.3138, + 
"mean_token_accuracy": 0.888512134552002, + "num_tokens": 527660423.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "grad_norm": 1.6498003005981445, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8966301679611206, + "num_tokens": 527695164.0, + "step": 13827 + }, + { + "epoch": 1.7590637323495737, + "grad_norm": 1.474300742149353, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8739628195762634, + "num_tokens": 527736875.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "grad_norm": 1.4958301782608032, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8741790056228638, + "num_tokens": 527773174.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "grad_norm": 1.50266432762146, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8724565505981445, + "num_tokens": 527814824.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "grad_norm": 1.572016954421997, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.875426173210144, + "num_tokens": 527852706.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "grad_norm": 1.5409222841262817, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8654418587684631, + "num_tokens": 527889567.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "grad_norm": 1.3802855014801025, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8803212642669678, + "num_tokens": 527933152.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "grad_norm": 1.444152593612671, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8719615936279297, + "num_tokens": 527972547.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "grad_norm": 1.5743112564086914, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8710789680480957, + "num_tokens": 528008783.0, + "step": 13835 + }, + { + "epoch": 
1.760081414578298, + "grad_norm": 1.3978692293167114, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8921940922737122, + "num_tokens": 528046684.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "grad_norm": 1.2662073373794556, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8877255916595459, + "num_tokens": 528095573.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "grad_norm": 1.4735254049301147, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8711791038513184, + "num_tokens": 528135732.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "grad_norm": 1.4625366926193237, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8737550377845764, + "num_tokens": 528175260.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "grad_norm": 1.4882091283798218, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.864019513130188, + "num_tokens": 528218178.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "grad_norm": 1.540090560913086, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.869104266166687, + "num_tokens": 528256241.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "grad_norm": 1.2804275751113892, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8777278661727905, + "num_tokens": 528303384.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "grad_norm": 1.3954026699066162, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8843499422073364, + "num_tokens": 528345225.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "grad_norm": 1.4411245584487915, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.876800537109375, + "num_tokens": 528388101.0, + "step": 13844 + }, + { + "epoch": 1.7612263070856127, + "grad_norm": 1.4901055097579956, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 
0.8714160919189453, + "num_tokens": 528425977.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "grad_norm": 1.5339453220367432, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8785518407821655, + "num_tokens": 528459264.0, + "step": 13846 + }, + { + "epoch": 1.7614807276427935, + "grad_norm": 1.6592823266983032, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.867326557636261, + "num_tokens": 528492347.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "grad_norm": 1.5136489868164062, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8728245496749878, + "num_tokens": 528528006.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "grad_norm": 1.5257279872894287, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8778525590896606, + "num_tokens": 528566396.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "grad_norm": 1.41432523727417, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8927582502365112, + "num_tokens": 528607876.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "grad_norm": 1.629421353340149, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8820163607597351, + "num_tokens": 528639711.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "grad_norm": 1.543573260307312, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8775205612182617, + "num_tokens": 528676202.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "grad_norm": 1.5911701917648315, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.896135687828064, + "num_tokens": 528705532.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "grad_norm": 1.5121359825134277, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8645531535148621, + "num_tokens": 528742586.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "grad_norm": 
1.447572112083435, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.885881781578064, + "num_tokens": 528784653.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "grad_norm": 1.5677250623703003, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8626593351364136, + "num_tokens": 528822160.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "grad_norm": 1.3963186740875244, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.882496178150177, + "num_tokens": 528863587.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "grad_norm": 1.428139567375183, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8719079494476318, + "num_tokens": 528903780.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "grad_norm": 1.7166279554367065, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8741028308868408, + "num_tokens": 528935625.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "grad_norm": 1.6024993658065796, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.872096061706543, + "num_tokens": 528970286.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "grad_norm": 1.3969420194625854, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8893904685974121, + "num_tokens": 529012579.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "grad_norm": 1.517964243888855, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.886012613773346, + "num_tokens": 529047482.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "grad_norm": 1.5061765909194946, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8923650979995728, + "num_tokens": 529082583.0, + "step": 13863 + }, + { + "epoch": 1.7636433023788323, + "grad_norm": 1.7653849124908447, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8619827032089233, + "num_tokens": 
529119335.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "grad_norm": 1.4588432312011719, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8666894435882568, + "num_tokens": 529158760.0, + "step": 13865 + }, + { + "epoch": 1.7638977229360133, + "grad_norm": 1.4772312641143799, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8909316062927246, + "num_tokens": 529198302.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "grad_norm": 1.5727852582931519, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8622061014175415, + "num_tokens": 529234724.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "grad_norm": 1.525575876235962, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8868242502212524, + "num_tokens": 529273332.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "grad_norm": 1.416929841041565, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8657485246658325, + "num_tokens": 529318540.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "grad_norm": 1.4944554567337036, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8890629410743713, + "num_tokens": 529354455.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "grad_norm": 1.440500020980835, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.895635187625885, + "num_tokens": 529391779.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "grad_norm": 1.4628490209579468, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8736565113067627, + "num_tokens": 529433152.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "grad_norm": 1.5981723070144653, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8873203992843628, + "num_tokens": 529471013.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "grad_norm": 1.4892221689224243, + "learning_rate": 
1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.879548192024231, + "num_tokens": 529512314.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "grad_norm": 1.5603923797607422, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8675665855407715, + "num_tokens": 529552928.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "grad_norm": 1.5174492597579956, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8685128688812256, + "num_tokens": 529591348.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "grad_norm": 1.4242651462554932, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8840085864067078, + "num_tokens": 529628241.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "grad_norm": 1.3975228071212769, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8801847696304321, + "num_tokens": 529672245.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "grad_norm": 1.4250082969665527, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8818950653076172, + "num_tokens": 529714483.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "grad_norm": 1.551647663116455, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8846352696418762, + "num_tokens": 529749652.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "grad_norm": 1.5779473781585693, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8778045773506165, + "num_tokens": 529786670.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "grad_norm": 1.4697421789169312, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8776929974555969, + "num_tokens": 529826372.0, + "step": 13882 + }, + { + "epoch": 1.7660602976720519, + "grad_norm": 1.4015424251556396, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8793487548828125, + "num_tokens": 529869143.0, + "step": 13883 + }, + 
{ + "epoch": 1.7661875079506424, + "grad_norm": 1.4540984630584717, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8774064779281616, + "num_tokens": 529910087.0, + "step": 13884 + }, + { + "epoch": 1.766314718229233, + "grad_norm": 1.4795701503753662, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8739893436431885, + "num_tokens": 529947763.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "grad_norm": 1.5413062572479248, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8787007331848145, + "num_tokens": 529984567.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "grad_norm": 1.5767452716827393, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8731639385223389, + "num_tokens": 530021233.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "grad_norm": 1.5765279531478882, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8641725778579712, + "num_tokens": 530060876.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "grad_norm": 1.385847568511963, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8747868537902832, + "num_tokens": 530105695.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "grad_norm": 1.4617016315460205, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8841219544410706, + "num_tokens": 530146123.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "grad_norm": 1.4128422737121582, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8807389736175537, + "num_tokens": 530186452.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "grad_norm": 1.4655442237854004, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8749071359634399, + "num_tokens": 530224594.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "grad_norm": 1.4638663530349731, + "learning_rate": 1e-06, + "loss": 0.3596, + 
"mean_token_accuracy": 0.873466432094574, + "num_tokens": 530263862.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "grad_norm": 1.5214921236038208, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8664114475250244, + "num_tokens": 530304431.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "grad_norm": 1.5178650617599487, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8797677755355835, + "num_tokens": 530340686.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "grad_norm": 1.4985278844833374, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8862407207489014, + "num_tokens": 530376326.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "grad_norm": 1.3549495935440063, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8850107192993164, + "num_tokens": 530420570.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "grad_norm": 1.583307147026062, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8778738975524902, + "num_tokens": 530454440.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "grad_norm": 1.4485543966293335, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8793788552284241, + "num_tokens": 530494411.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "grad_norm": 1.6050125360488892, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8565722703933716, + "num_tokens": 530528451.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "grad_norm": 1.4041945934295654, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8788973093032837, + "num_tokens": 530570232.0, + "step": 13901 + }, + { + "epoch": 1.7684772929652715, + "grad_norm": 1.4324368238449097, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8748984336853027, + "num_tokens": 530607906.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + 
"grad_norm": 1.5438615083694458, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8695617318153381, + "num_tokens": 530647373.0, + "step": 13903 + }, + { + "epoch": 1.7687317135224525, + "grad_norm": 1.6017074584960938, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8709732294082642, + "num_tokens": 530683319.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "grad_norm": 1.6130141019821167, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8560368418693542, + "num_tokens": 530724528.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "grad_norm": 1.5402122735977173, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8773086667060852, + "num_tokens": 530760525.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "grad_norm": 1.5170292854309082, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8818861246109009, + "num_tokens": 530794698.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "grad_norm": 1.8492345809936523, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8766112923622131, + "num_tokens": 530827098.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "grad_norm": 1.742070198059082, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8644729852676392, + "num_tokens": 530860636.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "grad_norm": 1.346696138381958, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8839782476425171, + "num_tokens": 530903173.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "grad_norm": 1.5030940771102905, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8745065927505493, + "num_tokens": 530940450.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "grad_norm": 1.3859477043151855, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8802027106285095, 
+ "num_tokens": 530981832.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "grad_norm": 1.3944447040557861, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8748302459716797, + "num_tokens": 531025372.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "grad_norm": 1.5572365522384644, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8783900141716003, + "num_tokens": 531062170.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "grad_norm": 1.5455589294433594, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8859046101570129, + "num_tokens": 531102504.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "grad_norm": 1.5382698774337769, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8721163868904114, + "num_tokens": 531138265.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "grad_norm": 1.5553141832351685, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.875478982925415, + "num_tokens": 531173796.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "grad_norm": 1.4439246654510498, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8785219788551331, + "num_tokens": 531213781.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "grad_norm": 1.5289056301116943, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8875917792320251, + "num_tokens": 531245637.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "grad_norm": 1.6842747926712036, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8630584478378296, + "num_tokens": 531281699.0, + "step": 13920 + }, + { + "epoch": 1.7708942882584913, + "grad_norm": 1.43061363697052, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8719215393066406, + "num_tokens": 531325451.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "grad_norm": 1.559821605682373, + 
"learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.867258608341217, + "num_tokens": 531362478.0, + "step": 13922 + }, + { + "epoch": 1.7711487088156723, + "grad_norm": 1.5448031425476074, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8842984437942505, + "num_tokens": 531397854.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "grad_norm": 1.5695863962173462, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8784208297729492, + "num_tokens": 531432281.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "grad_norm": 1.493266224861145, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8715861439704895, + "num_tokens": 531470842.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "grad_norm": 1.6429013013839722, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8589370250701904, + "num_tokens": 531505265.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "grad_norm": 1.472133755683899, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.874975323677063, + "num_tokens": 531544046.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "grad_norm": 1.4521313905715942, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8840779066085815, + "num_tokens": 531585353.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "grad_norm": 1.5381338596343994, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8831207752227783, + "num_tokens": 531620184.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "grad_norm": 1.454382061958313, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8704557418823242, + "num_tokens": 531660926.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "grad_norm": 1.4305737018585205, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8639876842498779, + "num_tokens": 531706895.0, + 
"step": 13931 + }, + { + "epoch": 1.7722936013229869, + "grad_norm": 1.5151422023773193, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8941471576690674, + "num_tokens": 531738603.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "grad_norm": 1.492104411125183, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8833343982696533, + "num_tokens": 531776103.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "grad_norm": 1.4082701206207275, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8802388906478882, + "num_tokens": 531818996.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "grad_norm": 1.4619150161743164, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8834021687507629, + "num_tokens": 531855318.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "grad_norm": 1.4782137870788574, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8680629134178162, + "num_tokens": 531897879.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "grad_norm": 1.5509613752365112, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8816407918930054, + "num_tokens": 531934815.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "grad_norm": 1.5617082118988037, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8800036907196045, + "num_tokens": 531969480.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "grad_norm": 1.4008219242095947, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8829441070556641, + "num_tokens": 532009406.0, + "step": 13939 + }, + { + "epoch": 1.773311283551711, + "grad_norm": 1.5422186851501465, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8939173221588135, + "num_tokens": 532044793.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "grad_norm": 1.5837630033493042, + "learning_rate": 1e-06, + "loss": 
0.4172, + "mean_token_accuracy": 0.8527406454086304, + "num_tokens": 532083456.0, + "step": 13941 + }, + { + "epoch": 1.7735657041088921, + "grad_norm": 1.658424973487854, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8680673241615295, + "num_tokens": 532124773.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "grad_norm": 1.6223845481872559, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8864408731460571, + "num_tokens": 532156482.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "grad_norm": 1.530144453048706, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8845997452735901, + "num_tokens": 532191413.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "grad_norm": 1.760286808013916, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8851951956748962, + "num_tokens": 532227242.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "grad_norm": 1.523213267326355, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8806498646736145, + "num_tokens": 532262822.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "grad_norm": 1.4346935749053955, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8939623236656189, + "num_tokens": 532302993.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "grad_norm": 1.5539546012878418, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8787680268287659, + "num_tokens": 532338137.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "grad_norm": 1.6655762195587158, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8727037906646729, + "num_tokens": 532374103.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "grad_norm": 1.5628719329833984, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8762321472167969, + "num_tokens": 532411305.0, + "step": 13950 + }, + { + "epoch": 
1.7747105966162064, + "grad_norm": 1.6113446950912476, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8816717863082886, + "num_tokens": 532443909.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "grad_norm": 1.6386297941207886, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8845877647399902, + "num_tokens": 532475327.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "grad_norm": 1.4538203477859497, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.882125973701477, + "num_tokens": 532516514.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "grad_norm": 1.4350093603134155, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8792749643325806, + "num_tokens": 532558875.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "grad_norm": 1.4753899574279785, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8641728758811951, + "num_tokens": 532601619.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "grad_norm": 1.5405446290969849, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8797781467437744, + "num_tokens": 532634821.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "grad_norm": 1.637683629989624, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.871870756149292, + "num_tokens": 532670927.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "grad_norm": 1.5277782678604126, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8652151823043823, + "num_tokens": 532714159.0, + "step": 13958 + }, + { + "epoch": 1.7757282788449307, + "grad_norm": 1.521845817565918, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8766379356384277, + "num_tokens": 532749708.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "grad_norm": 1.3885555267333984, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 
0.8877216577529907, + "num_tokens": 532790865.0, + "step": 13960 + }, + { + "epoch": 1.7759826994021117, + "grad_norm": 1.4576557874679565, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8773743510246277, + "num_tokens": 532827709.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "grad_norm": 1.428544044494629, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8689538240432739, + "num_tokens": 532867915.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "grad_norm": 1.4414894580841064, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8744531869888306, + "num_tokens": 532910634.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "grad_norm": 1.4405452013015747, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.875605583190918, + "num_tokens": 532949666.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "grad_norm": 1.368973731994629, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8799411654472351, + "num_tokens": 532992020.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "grad_norm": 1.5416251420974731, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8698293566703796, + "num_tokens": 533028029.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "grad_norm": 1.4906188249588013, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8730655312538147, + "num_tokens": 533070375.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "grad_norm": 1.452384114265442, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8836391568183899, + "num_tokens": 533113160.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "grad_norm": 1.6388641595840454, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8505121469497681, + "num_tokens": 533152715.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "grad_norm": 
1.5608797073364258, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8748127222061157, + "num_tokens": 533188048.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "grad_norm": 1.552706003189087, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8639024496078491, + "num_tokens": 533226803.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "grad_norm": 1.6404463052749634, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8724199533462524, + "num_tokens": 533263469.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "grad_norm": 1.5818657875061035, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8681594133377075, + "num_tokens": 533301432.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "grad_norm": 1.6042817831039429, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8850778341293335, + "num_tokens": 533335130.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "grad_norm": 1.3442333936691284, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.891390323638916, + "num_tokens": 533374815.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "grad_norm": 1.5600289106369019, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8866546750068665, + "num_tokens": 533410518.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "grad_norm": 1.6636677980422974, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8766580820083618, + "num_tokens": 533445719.0, + "step": 13977 + }, + { + "epoch": 1.7781452741381503, + "grad_norm": 1.488116979598999, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8734050393104553, + "num_tokens": 533485705.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "grad_norm": 1.4754098653793335, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8884146213531494, + 
"num_tokens": 533521228.0, + "step": 13979 + }, + { + "epoch": 1.7783996946953313, + "grad_norm": 1.490892767906189, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8751388788223267, + "num_tokens": 533558233.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "grad_norm": 1.5603680610656738, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8727275133132935, + "num_tokens": 533595966.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "grad_norm": 1.5808557271957397, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8763737678527832, + "num_tokens": 533629227.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "grad_norm": 1.5490244626998901, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8835607767105103, + "num_tokens": 533664902.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "grad_norm": 1.487998366355896, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8874894380569458, + "num_tokens": 533702873.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "grad_norm": 1.5350335836410522, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8692255616188049, + "num_tokens": 533737295.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "grad_norm": 1.5779366493225098, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8692353963851929, + "num_tokens": 533774180.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "grad_norm": 1.464315414428711, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.868278443813324, + "num_tokens": 533811179.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "grad_norm": 1.470628023147583, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8754392862319946, + "num_tokens": 533848892.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "grad_norm": 1.562099575996399, + 
"learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.894384503364563, + "num_tokens": 533883907.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "grad_norm": 1.5307848453521729, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8778665065765381, + "num_tokens": 533921550.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "grad_norm": 1.4772692918777466, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8749694228172302, + "num_tokens": 533961319.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "grad_norm": 1.6333070993423462, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8818687200546265, + "num_tokens": 533997953.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "grad_norm": 1.5470343828201294, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8788814544677734, + "num_tokens": 534035019.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "grad_norm": 1.5747663974761963, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.868411123752594, + "num_tokens": 534073011.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "grad_norm": 1.7934404611587524, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8559658527374268, + "num_tokens": 534105849.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "grad_norm": 1.458331823348999, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8683180809020996, + "num_tokens": 534149647.0, + "step": 13996 + }, + { + "epoch": 1.78056226943137, + "grad_norm": 1.5946305990219116, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8768845796585083, + "num_tokens": 534183901.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "grad_norm": 1.4645262956619263, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.879165530204773, + "num_tokens": 534221084.0, + "step": 
13998 + }, + { + "epoch": 1.780816689988551, + "grad_norm": 1.5420300960540771, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8683344125747681, + "num_tokens": 534259071.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "grad_norm": 1.3869359493255615, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8737396001815796, + "num_tokens": 534300964.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "grad_norm": 1.467370867729187, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8880939483642578, + "num_tokens": 534341312.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "grad_norm": 1.4014639854431152, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8905736804008484, + "num_tokens": 534380746.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "grad_norm": 1.4689949750900269, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8664884567260742, + "num_tokens": 534422193.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "grad_norm": 1.382347583770752, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8872884511947632, + "num_tokens": 534459899.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "grad_norm": 1.4672695398330688, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8862889409065247, + "num_tokens": 534498118.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "grad_norm": 1.5069301128387451, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8739908337593079, + "num_tokens": 534538547.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "grad_norm": 1.4515916109085083, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8798520565032959, + "num_tokens": 534577929.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "grad_norm": 1.8667843341827393, + "learning_rate": 1e-06, + "loss": 0.4246, + 
"mean_token_accuracy": 0.8540235757827759, + "num_tokens": 534616166.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "grad_norm": 1.4977067708969116, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8780993223190308, + "num_tokens": 534653404.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "grad_norm": 1.4319262504577637, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8669298887252808, + "num_tokens": 534696907.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "grad_norm": 1.554787039756775, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8711377382278442, + "num_tokens": 534733742.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "grad_norm": 1.5853407382965088, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.867291271686554, + "num_tokens": 534771065.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "grad_norm": 1.3766822814941406, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8915375471115112, + "num_tokens": 534807401.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "grad_norm": 1.4363967180252075, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8760791420936584, + "num_tokens": 534849885.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "grad_norm": 1.4267841577529907, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8901532888412476, + "num_tokens": 534889413.0, + "step": 14015 + }, + { + "epoch": 1.78297926472459, + "grad_norm": 1.4252700805664062, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8719086050987244, + "num_tokens": 534929767.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "grad_norm": 1.4766063690185547, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.87513267993927, + "num_tokens": 534970839.0, + "step": 14017 + }, + { + "epoch": 
1.7832336852817707, + "grad_norm": 1.4574017524719238, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8872689604759216, + "num_tokens": 535009390.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "grad_norm": 1.6626389026641846, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8801690936088562, + "num_tokens": 535041462.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "grad_norm": 1.485892415046692, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.873974084854126, + "num_tokens": 535080436.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "grad_norm": 1.580065131187439, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8793905973434448, + "num_tokens": 535117484.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "grad_norm": 1.493680715560913, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8849915266036987, + "num_tokens": 535155218.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "grad_norm": 1.5653070211410522, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8796544075012207, + "num_tokens": 535188893.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "grad_norm": 1.565810203552246, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8672670722007751, + "num_tokens": 535226794.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "grad_norm": 1.5420749187469482, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8776755928993225, + "num_tokens": 535264300.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "grad_norm": 1.6426891088485718, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8747527599334717, + "num_tokens": 535297763.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "grad_norm": 1.5629401206970215, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 
0.8702481985092163, + "num_tokens": 535336459.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "grad_norm": 1.4654226303100586, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8644059300422668, + "num_tokens": 535376731.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "grad_norm": 1.3456826210021973, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8906128406524658, + "num_tokens": 535417630.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "grad_norm": 1.4495075941085815, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8925720453262329, + "num_tokens": 535456862.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "grad_norm": 1.4810187816619873, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8738755583763123, + "num_tokens": 535496051.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "grad_norm": 1.4527946710586548, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.869841992855072, + "num_tokens": 535537359.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "grad_norm": 1.567204475402832, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8490056395530701, + "num_tokens": 535578304.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "grad_norm": 1.5750741958618164, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8987283706665039, + "num_tokens": 535610090.0, + "step": 14034 + }, + { + "epoch": 1.7853962600178095, + "grad_norm": 1.488701343536377, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8622535467147827, + "num_tokens": 535654945.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "grad_norm": 1.4509698152542114, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8821622729301453, + "num_tokens": 535697752.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "grad_norm": 
1.4204829931259155, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8708598613739014, + "num_tokens": 535740903.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "grad_norm": 1.6178369522094727, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8780776262283325, + "num_tokens": 535773415.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "grad_norm": 1.5205974578857422, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8851214051246643, + "num_tokens": 535807759.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "grad_norm": 1.495642900466919, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8744747638702393, + "num_tokens": 535845832.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "grad_norm": 1.4732593297958374, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8867441415786743, + "num_tokens": 535886414.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "grad_norm": 1.5923184156417847, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8606494665145874, + "num_tokens": 535924920.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "grad_norm": 1.4983723163604736, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8838242292404175, + "num_tokens": 535958095.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "grad_norm": 1.6200469732284546, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8712340593338013, + "num_tokens": 535993640.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "grad_norm": 1.4240829944610596, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8652603626251221, + "num_tokens": 536037728.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "grad_norm": 1.657736897468567, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8704469203948975, + "num_tokens": 
536069768.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "grad_norm": 1.5485591888427734, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8822586536407471, + "num_tokens": 536102599.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "grad_norm": 1.4046306610107422, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8750243186950684, + "num_tokens": 536142599.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "grad_norm": 1.4119586944580078, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8903266191482544, + "num_tokens": 536181054.0, + "step": 14049 + }, + { + "epoch": 1.787304414196667, + "grad_norm": 1.51549232006073, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8774584531784058, + "num_tokens": 536218146.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "grad_norm": 1.5151454210281372, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8715213537216187, + "num_tokens": 536254911.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "grad_norm": 1.5168044567108154, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8889802694320679, + "num_tokens": 536290663.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "grad_norm": 1.7130458354949951, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8746907711029053, + "num_tokens": 536320198.0, + "step": 14053 + }, + { + "epoch": 1.787813255311029, + "grad_norm": 1.5657110214233398, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8821722269058228, + "num_tokens": 536356770.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "grad_norm": 1.4828228950500488, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8725366592407227, + "num_tokens": 536394868.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "grad_norm": 1.458899974822998, + "learning_rate": 
1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8764272928237915, + "num_tokens": 536433609.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "grad_norm": 1.4492807388305664, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8819034099578857, + "num_tokens": 536470941.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "grad_norm": 1.4818321466445923, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8704335689544678, + "num_tokens": 536512156.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "grad_norm": 1.4219619035720825, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8834347128868103, + "num_tokens": 536550374.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "grad_norm": 1.5260045528411865, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8718833327293396, + "num_tokens": 536592894.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "grad_norm": 1.3665658235549927, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8838313221931458, + "num_tokens": 536632588.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "grad_norm": 1.4773446321487427, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8731905817985535, + "num_tokens": 536670624.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "grad_norm": 1.3929781913757324, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8894060850143433, + "num_tokens": 536711865.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "grad_norm": 1.4439656734466553, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8686243295669556, + "num_tokens": 536751666.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "grad_norm": 1.4159091711044312, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8891661167144775, + "num_tokens": 536790852.0, + "step": 14065 + }, 
+ { + "epoch": 1.7893397786541152, + "grad_norm": 1.4210268259048462, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8813336491584778, + "num_tokens": 536830594.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "grad_norm": 1.4809389114379883, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8887811899185181, + "num_tokens": 536866286.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "grad_norm": 1.4460822343826294, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8735066652297974, + "num_tokens": 536908035.0, + "step": 14068 + }, + { + "epoch": 1.7897214094898868, + "grad_norm": 1.4629350900650024, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8902149796485901, + "num_tokens": 536942897.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "grad_norm": 1.64378023147583, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8665561676025391, + "num_tokens": 536978358.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "grad_norm": 1.574845790863037, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8901063203811646, + "num_tokens": 537009098.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "grad_norm": 1.3966737985610962, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8800692558288574, + "num_tokens": 537057050.0, + "step": 14072 + }, + { + "epoch": 1.7902302506042487, + "grad_norm": 1.367493748664856, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8913560509681702, + "num_tokens": 537094422.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "grad_norm": 1.5322540998458862, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.881077766418457, + "num_tokens": 537129452.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "grad_norm": 1.53056001663208, + "learning_rate": 1e-06, + "loss": 0.3246, + 
"mean_token_accuracy": 0.8844202756881714, + "num_tokens": 537166728.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "grad_norm": 1.6730256080627441, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8717800378799438, + "num_tokens": 537197293.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "grad_norm": 1.4288438558578491, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8850921392440796, + "num_tokens": 537239915.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "grad_norm": 1.499366283416748, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8871119618415833, + "num_tokens": 537276114.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "grad_norm": 1.3760017156600952, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8806416988372803, + "num_tokens": 537319447.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "grad_norm": 1.499203085899353, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8683341145515442, + "num_tokens": 537357552.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "grad_norm": 1.5054192543029785, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8742631673812866, + "num_tokens": 537394650.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "grad_norm": 1.4264613389968872, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8775843381881714, + "num_tokens": 537435504.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "grad_norm": 1.4893097877502441, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8811430931091309, + "num_tokens": 537475737.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "grad_norm": 1.351137399673462, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8846909999847412, + "num_tokens": 537518590.0, + "step": 14084 + }, + { + "epoch": 
1.791756773947335, + "grad_norm": 1.5130201578140259, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.872402548789978, + "num_tokens": 537556613.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "grad_norm": 1.5209639072418213, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8858798742294312, + "num_tokens": 537591325.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "grad_norm": 1.5297865867614746, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8669401407241821, + "num_tokens": 537629642.0, + "step": 14087 + }, + { + "epoch": 1.7921384047831066, + "grad_norm": 1.4848604202270508, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8789018392562866, + "num_tokens": 537666756.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "grad_norm": 1.4685697555541992, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8842707872390747, + "num_tokens": 537703463.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "grad_norm": 1.3434027433395386, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8942757248878479, + "num_tokens": 537744818.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "grad_norm": 1.5558912754058838, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.886547863483429, + "num_tokens": 537781887.0, + "step": 14091 + }, + { + "epoch": 1.7926472458974685, + "grad_norm": 1.4813321828842163, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8841142058372498, + "num_tokens": 537817996.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "grad_norm": 1.3757797479629517, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8678178787231445, + "num_tokens": 537864504.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "grad_norm": 1.4432473182678223, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 
0.8767843842506409, + "num_tokens": 537905755.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "grad_norm": 1.826772928237915, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8844214677810669, + "num_tokens": 537932359.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "grad_norm": 1.4224478006362915, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8872478008270264, + "num_tokens": 537973530.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "grad_norm": 1.4294553995132446, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8722285628318787, + "num_tokens": 538017684.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "grad_norm": 1.4517738819122314, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.878982424736023, + "num_tokens": 538054872.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "grad_norm": 1.4854196310043335, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8841956853866577, + "num_tokens": 538091871.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "grad_norm": 1.3759018182754517, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8894639015197754, + "num_tokens": 538131585.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "grad_norm": 1.5646753311157227, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8902574777603149, + "num_tokens": 538164235.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "grad_norm": 1.5074982643127441, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8854044079780579, + "num_tokens": 538198164.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "grad_norm": 1.610100269317627, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8717331290245056, + "num_tokens": 538235219.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "grad_norm": 
1.4139715433120728, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8753811120986938, + "num_tokens": 538279862.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "grad_norm": 1.5308150053024292, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8838098049163818, + "num_tokens": 538317292.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "grad_norm": 1.5313475131988525, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8840947151184082, + "num_tokens": 538354416.0, + "step": 14106 + }, + { + "epoch": 1.7945554000763262, + "grad_norm": 1.5457675457000732, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8858204483985901, + "num_tokens": 538389353.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "grad_norm": 1.568916916847229, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8796724677085876, + "num_tokens": 538423759.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "grad_norm": 1.4794831275939941, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8783881664276123, + "num_tokens": 538465285.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "grad_norm": 1.5155974626541138, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8897901773452759, + "num_tokens": 538507518.0, + "step": 14110 + }, + { + "epoch": 1.7950642411906883, + "grad_norm": 1.4211794137954712, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.878740668296814, + "num_tokens": 538549230.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "grad_norm": 1.5892558097839355, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8855754137039185, + "num_tokens": 538579139.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "grad_norm": 1.3604834079742432, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.877143144607544, + 
"num_tokens": 538626546.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "grad_norm": 1.4375184774398804, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8851445317268372, + "num_tokens": 538666520.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "grad_norm": 1.5542097091674805, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8753604888916016, + "num_tokens": 538702306.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "grad_norm": 1.4265339374542236, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.876818835735321, + "num_tokens": 538743116.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "grad_norm": 1.3847535848617554, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8705441951751709, + "num_tokens": 538790294.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "grad_norm": 1.5879249572753906, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8738092184066772, + "num_tokens": 538829487.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "grad_norm": 1.6091312170028687, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8685227632522583, + "num_tokens": 538863984.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "grad_norm": 1.5582901239395142, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8563207387924194, + "num_tokens": 538908537.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "grad_norm": 1.6567878723144531, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8706015944480896, + "num_tokens": 538942615.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "grad_norm": 1.4549156427383423, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8701673746109009, + "num_tokens": 538986362.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "grad_norm": 1.52671480178833, + 
"learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8904063701629639, + "num_tokens": 539018864.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "grad_norm": 1.4080084562301636, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8773602247238159, + "num_tokens": 539062741.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "grad_norm": 1.4450623989105225, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8787428140640259, + "num_tokens": 539101034.0, + "step": 14125 + }, + { + "epoch": 1.7969723953695458, + "grad_norm": 1.529976725578308, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8746921420097351, + "num_tokens": 539141110.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "grad_norm": 1.3332529067993164, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8841724395751953, + "num_tokens": 539183493.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "grad_norm": 1.4121944904327393, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8831862211227417, + "num_tokens": 539224633.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "grad_norm": 1.4849588871002197, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8904924392700195, + "num_tokens": 539257403.0, + "step": 14129 + }, + { + "epoch": 1.797481236483908, + "grad_norm": 1.5914251804351807, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8601377606391907, + "num_tokens": 539294988.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "grad_norm": 1.6138657331466675, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8629810214042664, + "num_tokens": 539330752.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "grad_norm": 1.4530638456344604, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8762578368186951, + "num_tokens": 539373818.0, + 
"step": 14132 + }, + { + "epoch": 1.7978628673196795, + "grad_norm": 1.5245826244354248, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8806161880493164, + "num_tokens": 539408042.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "grad_norm": 1.5737322568893433, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8759207725524902, + "num_tokens": 539440376.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "grad_norm": 1.651415228843689, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8786813020706177, + "num_tokens": 539469711.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "grad_norm": 1.4702740907669067, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.884080171585083, + "num_tokens": 539510310.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "grad_norm": 1.4854661226272583, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8791086673736572, + "num_tokens": 539545965.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "grad_norm": 1.5152519941329956, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8857228755950928, + "num_tokens": 539579701.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "grad_norm": 1.4873907566070557, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8816707134246826, + "num_tokens": 539619861.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "grad_norm": 1.499672770500183, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8699270486831665, + "num_tokens": 539658334.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "grad_norm": 1.6212761402130127, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.870099663734436, + "num_tokens": 539696931.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "grad_norm": 1.389528512954712, + "learning_rate": 1e-06, + "loss": 
0.3029, + "mean_token_accuracy": 0.8949477672576904, + "num_tokens": 539734997.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "grad_norm": 1.5995888710021973, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8741792440414429, + "num_tokens": 539771786.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "grad_norm": 1.5212395191192627, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.859616756439209, + "num_tokens": 539814640.0, + "step": 14144 + }, + { + "epoch": 1.7993893906627656, + "grad_norm": 1.5137193202972412, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8860088586807251, + "num_tokens": 539849874.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "grad_norm": 1.3606637716293335, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8930513858795166, + "num_tokens": 539890548.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "grad_norm": 1.4541627168655396, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8707743883132935, + "num_tokens": 539931870.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "grad_norm": 1.464417576789856, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8717547059059143, + "num_tokens": 539970847.0, + "step": 14148 + }, + { + "epoch": 1.7998982317771275, + "grad_norm": 1.575488567352295, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8644192218780518, + "num_tokens": 540006843.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "grad_norm": 1.640418291091919, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8817578554153442, + "num_tokens": 540038336.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "grad_norm": 1.41978120803833, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8973836898803711, + "num_tokens": 540074990.0, + "step": 14151 + }, + { + "epoch": 
1.800279862612899, + "grad_norm": 1.4712330102920532, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8670032024383545, + "num_tokens": 540116268.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "grad_norm": 1.4662755727767944, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8762605786323547, + "num_tokens": 540155356.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "grad_norm": 1.4257079362869263, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8860818147659302, + "num_tokens": 540197246.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "grad_norm": 1.4535605907440186, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8742275834083557, + "num_tokens": 540237935.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "grad_norm": 1.3979116678237915, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.882404088973999, + "num_tokens": 540276023.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "grad_norm": 1.6362292766571045, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8793562650680542, + "num_tokens": 540308492.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "grad_norm": 1.6207268238067627, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.895900309085846, + "num_tokens": 540338795.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "grad_norm": 1.407758355140686, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8762457370758057, + "num_tokens": 540381412.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "grad_norm": 1.5489569902420044, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8717324137687683, + "num_tokens": 540416969.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "grad_norm": 1.450875163078308, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 
0.8782418966293335, + "num_tokens": 540458330.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "grad_norm": 1.5058672428131104, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8774600028991699, + "num_tokens": 540497007.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "grad_norm": 1.4880261421203613, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8682340383529663, + "num_tokens": 540538297.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + "grad_norm": 1.6101850271224976, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8719958066940308, + "num_tokens": 540572466.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "grad_norm": 1.436210036277771, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8820512294769287, + "num_tokens": 540612155.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "grad_norm": 1.4244837760925293, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8873192667961121, + "num_tokens": 540651552.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "grad_norm": 1.6851524114608765, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8684751987457275, + "num_tokens": 540685484.0, + "step": 14167 + }, + { + "epoch": 1.8023152270703473, + "grad_norm": 1.6251164674758911, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8709127306938171, + "num_tokens": 540721273.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "grad_norm": 1.4875643253326416, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8641350269317627, + "num_tokens": 540763870.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "grad_norm": 1.6759785413742065, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8708032369613647, + "num_tokens": 540800304.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "grad_norm": 
1.482154130935669, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8700580596923828, + "num_tokens": 540838159.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "grad_norm": 1.4281988143920898, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.892971396446228, + "num_tokens": 540878761.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "grad_norm": 1.5603487491607666, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8653076887130737, + "num_tokens": 540916089.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "grad_norm": 1.566728949546814, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8765074014663696, + "num_tokens": 540953708.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "grad_norm": 1.5424576997756958, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8753684759140015, + "num_tokens": 540990858.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "grad_norm": 1.4614684581756592, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8678447008132935, + "num_tokens": 541032192.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "grad_norm": 1.390123724937439, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.887474536895752, + "num_tokens": 541072198.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "grad_norm": 1.4558459520339966, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8793359398841858, + "num_tokens": 541113749.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "grad_norm": 1.5475924015045166, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8727775812149048, + "num_tokens": 541149550.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "grad_norm": 1.558353066444397, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8784633278846741, + "num_tokens": 
541183009.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "grad_norm": 1.4577730894088745, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8739955425262451, + "num_tokens": 541222055.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "grad_norm": 1.502035140991211, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8868695497512817, + "num_tokens": 541256659.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "grad_norm": 1.5658953189849854, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8902764320373535, + "num_tokens": 541287851.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "grad_norm": 1.4328197240829468, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8863368630409241, + "num_tokens": 541329628.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "grad_norm": 1.4529800415039062, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8798104524612427, + "num_tokens": 541369811.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "grad_norm": 1.6058465242385864, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8816096782684326, + "num_tokens": 541402711.0, + "step": 14186 + }, + { + "epoch": 1.8047322223635671, + "grad_norm": 1.473920464515686, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8740932941436768, + "num_tokens": 541441767.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "grad_norm": 1.3989506959915161, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8950519561767578, + "num_tokens": 541480562.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "grad_norm": 1.5902730226516724, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8856794834136963, + "num_tokens": 541512282.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "grad_norm": 1.4869952201843262, + "learning_rate": 
1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8756346702575684, + "num_tokens": 541549436.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "grad_norm": 1.6147205829620361, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.873164713382721, + "num_tokens": 541587349.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "grad_norm": 1.5285955667495728, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8799064755439758, + "num_tokens": 541626097.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "grad_norm": 1.7041105031967163, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8730762600898743, + "num_tokens": 541656888.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "grad_norm": 1.4697257280349731, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.882867693901062, + "num_tokens": 541695899.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "grad_norm": 1.4353079795837402, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.873010516166687, + "num_tokens": 541736893.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "grad_norm": 1.5045084953308105, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8860216736793518, + "num_tokens": 541774776.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "grad_norm": 1.426817774772644, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8780350685119629, + "num_tokens": 541817973.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "grad_norm": 1.4182528257369995, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.876209557056427, + "num_tokens": 541862110.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "grad_norm": 1.3904260396957397, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.895030677318573, + "num_tokens": 541901523.0, + "step": 14199 + }, + { + 
"epoch": 1.8063859559852435, + "grad_norm": 1.5152703523635864, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8776211142539978, + "num_tokens": 541937760.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "grad_norm": 1.4813123941421509, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8887243270874023, + "num_tokens": 541970663.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "grad_norm": 1.6021398305892944, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8633396625518799, + "num_tokens": 542012358.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "grad_norm": 1.6892344951629639, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8787816762924194, + "num_tokens": 542048509.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "grad_norm": 1.529748558998108, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8623435497283936, + "num_tokens": 542090592.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "grad_norm": 1.5214805603027344, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.877841591835022, + "num_tokens": 542124712.0, + "step": 14205 + }, + { + "epoch": 1.8071492176567867, + "grad_norm": 1.3271363973617554, + "learning_rate": 1e-06, + "loss": 0.2721, + "mean_token_accuracy": 0.9022068977355957, + "num_tokens": 542165124.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "grad_norm": 1.497727632522583, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8806101083755493, + "num_tokens": 542201668.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "grad_norm": 1.6050550937652588, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8763459920883179, + "num_tokens": 542242040.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "grad_norm": 1.4872032403945923, + "learning_rate": 1e-06, + "loss": 0.4108, + 
"mean_token_accuracy": 0.8587404489517212, + "num_tokens": 542284857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "grad_norm": 1.4511040449142456, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8659136295318604, + "num_tokens": 542325881.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "grad_norm": 1.4311422109603882, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8684430718421936, + "num_tokens": 542366290.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "grad_norm": 1.4960187673568726, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8763003349304199, + "num_tokens": 542401526.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "grad_norm": 1.4768002033233643, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8731138706207275, + "num_tokens": 542442003.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "grad_norm": 1.644148349761963, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8820112943649292, + "num_tokens": 542472896.0, + "step": 14214 + }, + { + "epoch": 1.8082941101641012, + "grad_norm": 1.5288972854614258, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8667048215866089, + "num_tokens": 542513534.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "grad_norm": 1.5447529554367065, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8772914409637451, + "num_tokens": 542551566.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "grad_norm": 1.497148871421814, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8639448285102844, + "num_tokens": 542590443.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "grad_norm": 1.559627890586853, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8650559186935425, + "num_tokens": 542628304.0, + "step": 14218 + }, + { + "epoch": 
1.8088029512784631, + "grad_norm": 1.4443755149841309, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.877026379108429, + "num_tokens": 542670216.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "grad_norm": 1.5799990892410278, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8660416603088379, + "num_tokens": 542708013.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "grad_norm": 1.4722113609313965, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8743108510971069, + "num_tokens": 542746903.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "grad_norm": 1.4239181280136108, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.880530595779419, + "num_tokens": 542787340.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "grad_norm": 1.5004068613052368, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8713308572769165, + "num_tokens": 542826288.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "grad_norm": 1.5063344240188599, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8752802610397339, + "num_tokens": 542863248.0, + "step": 14224 + }, + { + "epoch": 1.8095662129500063, + "grad_norm": 1.5989476442337036, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8689984679222107, + "num_tokens": 542898703.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "grad_norm": 1.41692316532135, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8787358999252319, + "num_tokens": 542937572.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "grad_norm": 1.5969828367233276, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8777878880500793, + "num_tokens": 542972563.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "grad_norm": 1.399614930152893, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 
0.8975721597671509, + "num_tokens": 543009523.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "grad_norm": 1.365486741065979, + "learning_rate": 1e-06, + "loss": 0.2671, + "mean_token_accuracy": 0.9029844403266907, + "num_tokens": 543047364.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "grad_norm": 1.580282211303711, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8828983902931213, + "num_tokens": 543080914.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "grad_norm": 1.562227487564087, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8697015047073364, + "num_tokens": 543116212.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "grad_norm": 1.4224591255187988, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8820934295654297, + "num_tokens": 543156341.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "grad_norm": 1.5895360708236694, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8757537007331848, + "num_tokens": 543190576.0, + "step": 14233 + }, + { + "epoch": 1.810711105457321, + "grad_norm": 1.5245903730392456, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8771044611930847, + "num_tokens": 543228714.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "grad_norm": 1.5209332704544067, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8851203918457031, + "num_tokens": 543266122.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "grad_norm": 1.5334960222244263, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8770533800125122, + "num_tokens": 543302983.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "grad_norm": 1.4873175621032715, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8730918169021606, + "num_tokens": 543340862.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "grad_norm": 
1.69038724899292, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8699790239334106, + "num_tokens": 543373450.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "grad_norm": 1.4617902040481567, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8742544651031494, + "num_tokens": 543416312.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "grad_norm": 1.5786364078521729, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8914294242858887, + "num_tokens": 543451329.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "grad_norm": 1.3968418836593628, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8927246332168579, + "num_tokens": 543492559.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "grad_norm": 1.58250093460083, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8699650168418884, + "num_tokens": 543527842.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "grad_norm": 1.3416569232940674, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8914386034011841, + "num_tokens": 543567969.0, + "step": 14243 + }, + { + "epoch": 1.811983208243226, + "grad_norm": 1.3656259775161743, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8796212673187256, + "num_tokens": 543612046.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "grad_norm": 1.6757702827453613, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8688044548034668, + "num_tokens": 543652910.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "grad_norm": 1.542798638343811, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8814305067062378, + "num_tokens": 543687481.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "grad_norm": 1.576048493385315, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8652138710021973, + "num_tokens": 
543724870.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "grad_norm": 1.6377558708190918, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8839884996414185, + "num_tokens": 543757740.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "grad_norm": 1.5128909349441528, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8807600140571594, + "num_tokens": 543797191.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "grad_norm": 1.557518482208252, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8656786680221558, + "num_tokens": 543836529.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "grad_norm": 1.4059044122695923, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.880193829536438, + "num_tokens": 543877557.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "grad_norm": 1.448976755142212, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8699309825897217, + "num_tokens": 543919041.0, + "step": 14252 + }, + { + "epoch": 1.8131281007505406, + "grad_norm": 1.3527545928955078, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8786133527755737, + "num_tokens": 543963846.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "grad_norm": 1.46039617061615, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8781887888908386, + "num_tokens": 544001139.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "grad_norm": 1.544459581375122, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8863451480865479, + "num_tokens": 544034442.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "grad_norm": 1.5156997442245483, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8768847584724426, + "num_tokens": 544073447.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "grad_norm": 1.3512699604034424, + "learning_rate": 
1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8774329423904419, + "num_tokens": 544119330.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "grad_norm": 1.5924315452575684, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8925514817237854, + "num_tokens": 544147930.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "grad_norm": 1.4982327222824097, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8744021654129028, + "num_tokens": 544184887.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "grad_norm": 1.379809021949768, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8750609159469604, + "num_tokens": 544227379.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "grad_norm": 1.4679477214813232, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8801764845848083, + "num_tokens": 544267777.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "grad_norm": 1.5214924812316895, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8945519924163818, + "num_tokens": 544299348.0, + "step": 14262 + }, + { + "epoch": 1.8144002035364457, + "grad_norm": 1.526510238647461, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8666542768478394, + "num_tokens": 544335469.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "grad_norm": 1.525651216506958, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8833564519882202, + "num_tokens": 544369944.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "grad_norm": 1.7601561546325684, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8714505434036255, + "num_tokens": 544406537.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "grad_norm": 1.6107213497161865, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8565273880958557, + "num_tokens": 544445557.0, + "step": 14266 + }, + 
{ + "epoch": 1.8149090446508078, + "grad_norm": 1.3826661109924316, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8881276845932007, + "num_tokens": 544489537.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "grad_norm": 1.4808450937271118, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.877720057964325, + "num_tokens": 544528901.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "grad_norm": 1.4311197996139526, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8807659149169922, + "num_tokens": 544568700.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "grad_norm": 1.4540512561798096, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8842310309410095, + "num_tokens": 544609228.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "grad_norm": 1.6023616790771484, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8778063058853149, + "num_tokens": 544646120.0, + "step": 14271 + }, + { + "epoch": 1.8155450960437602, + "grad_norm": 1.494812250137329, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8774783611297607, + "num_tokens": 544683275.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "grad_norm": 1.3932578563690186, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8844580054283142, + "num_tokens": 544727218.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "grad_norm": 1.5730390548706055, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8756651282310486, + "num_tokens": 544762969.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "grad_norm": 1.5050688982009888, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8801773190498352, + "num_tokens": 544800916.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "grad_norm": 1.6179721355438232, + "learning_rate": 1e-06, + "loss": 0.3431, + 
"mean_token_accuracy": 0.8783228397369385, + "num_tokens": 544836810.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "grad_norm": 1.562355399131775, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8808560967445374, + "num_tokens": 544873468.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "grad_norm": 1.5061932802200317, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8812093138694763, + "num_tokens": 544912086.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "grad_norm": 1.5998910665512085, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8711711764335632, + "num_tokens": 544946452.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "grad_norm": 1.4798452854156494, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8799095153808594, + "num_tokens": 544985398.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "grad_norm": 1.439659833908081, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.870751142501831, + "num_tokens": 545025854.0, + "step": 14281 + }, + { + "epoch": 1.8168171988296655, + "grad_norm": 1.5601115226745605, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.866791307926178, + "num_tokens": 545061970.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "grad_norm": 1.4907708168029785, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8922098875045776, + "num_tokens": 545097535.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "grad_norm": 1.5187315940856934, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8818780183792114, + "num_tokens": 545134336.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "grad_norm": 1.5437206029891968, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8694730997085571, + "num_tokens": 545170462.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + 
"grad_norm": 1.5643846988677979, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8598783612251282, + "num_tokens": 545210008.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "grad_norm": 1.388393759727478, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8933525681495667, + "num_tokens": 545248846.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "grad_norm": 1.4650461673736572, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8842580914497375, + "num_tokens": 545285878.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "grad_norm": 1.4768742322921753, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8796809911727905, + "num_tokens": 545323984.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "grad_norm": 1.4384344816207886, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.881637692451477, + "num_tokens": 545365577.0, + "step": 14290 + }, + { + "epoch": 1.81796209133698, + "grad_norm": 1.4440642595291138, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8839514255523682, + "num_tokens": 545403976.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "grad_norm": 1.545386791229248, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8836407661437988, + "num_tokens": 545440608.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "grad_norm": 1.4763647317886353, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8863036632537842, + "num_tokens": 545481259.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "grad_norm": 1.6642674207687378, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8647750616073608, + "num_tokens": 545514634.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "grad_norm": 1.5862241983413696, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8724343180656433, + 
"num_tokens": 545549993.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "grad_norm": 1.4477671384811401, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8833023309707642, + "num_tokens": 545590022.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "grad_norm": 1.5304839611053467, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.874915361404419, + "num_tokens": 545630338.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "grad_norm": 1.4082918167114258, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8952692151069641, + "num_tokens": 545666099.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "grad_norm": 1.4328500032424927, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8858112096786499, + "num_tokens": 545706314.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "grad_norm": 1.4937435388565063, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8853888511657715, + "num_tokens": 545743742.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + "grad_norm": 1.5554397106170654, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8735685348510742, + "num_tokens": 545779588.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "grad_norm": 1.5698820352554321, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8680900931358337, + "num_tokens": 545816452.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "grad_norm": 1.4184319972991943, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8804370164871216, + "num_tokens": 545858499.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "grad_norm": 1.5366237163543701, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8772920370101929, + "num_tokens": 545892910.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "grad_norm": 1.5126641988754272, + 
"learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8691612482070923, + "num_tokens": 545932730.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "grad_norm": 1.4960938692092896, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8802219033241272, + "num_tokens": 545968728.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "grad_norm": 1.4028520584106445, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8865451216697693, + "num_tokens": 546009205.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "grad_norm": 1.5568082332611084, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8702258467674255, + "num_tokens": 546045274.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "grad_norm": 1.5386213064193726, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8788331151008606, + "num_tokens": 546080415.0, + "step": 14309 + }, + { + "epoch": 1.8203790866301999, + "grad_norm": 1.5050365924835205, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8948779106140137, + "num_tokens": 546116041.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "grad_norm": 1.4517097473144531, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8792648911476135, + "num_tokens": 546155871.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "grad_norm": 1.5794013738632202, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8609580993652344, + "num_tokens": 546196565.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "grad_norm": 1.5850292444229126, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8770874738693237, + "num_tokens": 546230560.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "grad_norm": 1.484582781791687, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.879552960395813, + "num_tokens": 546268087.0, + 
"step": 14314 + }, + { + "epoch": 1.8210151380231523, + "grad_norm": 1.3442927598953247, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8959170579910278, + "num_tokens": 546309043.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "grad_norm": 1.394600749015808, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8881778717041016, + "num_tokens": 546346792.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "grad_norm": 1.5027823448181152, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8778078556060791, + "num_tokens": 546382744.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "grad_norm": 1.4610310792922974, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8757030367851257, + "num_tokens": 546420983.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "grad_norm": 1.6549245119094849, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8689257502555847, + "num_tokens": 546452830.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + "grad_norm": 1.3843138217926025, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8706337213516235, + "num_tokens": 546498682.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "grad_norm": 1.4653834104537964, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8634790182113647, + "num_tokens": 546539949.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "grad_norm": 1.5347306728363037, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.865389883518219, + "num_tokens": 546579429.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "grad_norm": 1.602779507637024, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8661320209503174, + "num_tokens": 546613686.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "grad_norm": 1.4459116458892822, + "learning_rate": 1e-06, + 
"loss": 0.3403, + "mean_token_accuracy": 0.8779534697532654, + "num_tokens": 546649620.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "grad_norm": 1.5550674200057983, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.872872531414032, + "num_tokens": 546686437.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "grad_norm": 1.5165965557098389, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8817970752716064, + "num_tokens": 546727215.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "grad_norm": 1.5312550067901611, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8974281549453735, + "num_tokens": 546763715.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "grad_norm": 1.4885302782058716, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8669107556343079, + "num_tokens": 546804350.0, + "step": 14328 + }, + { + "epoch": 1.8227960819234195, + "grad_norm": 1.6481220722198486, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8590782284736633, + "num_tokens": 546842227.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "grad_norm": 1.4879472255706787, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8643198609352112, + "num_tokens": 546885703.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "grad_norm": 1.4559826850891113, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8948391079902649, + "num_tokens": 546922107.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "grad_norm": 1.5326379537582397, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8699231147766113, + "num_tokens": 546960642.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "grad_norm": 1.5555857419967651, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.868015468120575, + "num_tokens": 546998351.0, + "step": 14333 + }, + { + 
"epoch": 1.823432133316372, + "grad_norm": 1.4571688175201416, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8803243041038513, + "num_tokens": 547039463.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "grad_norm": 1.546082854270935, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8637044429779053, + "num_tokens": 547080905.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "grad_norm": 1.4548686742782593, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8825861215591431, + "num_tokens": 547116994.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "grad_norm": 1.536455512046814, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8664232492446899, + "num_tokens": 547155893.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "grad_norm": 1.4143069982528687, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8728309869766235, + "num_tokens": 547196722.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + "grad_norm": 1.5415140390396118, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8568565845489502, + "num_tokens": 547237723.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "grad_norm": 1.483728051185608, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8720149397850037, + "num_tokens": 547277240.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "grad_norm": 1.5035320520401, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8763495683670044, + "num_tokens": 547315088.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "grad_norm": 1.556045651435852, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8649271130561829, + "num_tokens": 547353183.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "grad_norm": 1.5876004695892334, + "learning_rate": 1e-06, + "loss": 0.366, + 
"mean_token_accuracy": 0.873680830001831, + "num_tokens": 547389678.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "grad_norm": 1.4185969829559326, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8803025484085083, + "num_tokens": 547430529.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "grad_norm": 1.5347323417663574, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8827770948410034, + "num_tokens": 547464663.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "grad_norm": 1.4621304273605347, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8830094337463379, + "num_tokens": 547502659.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "grad_norm": 1.2776490449905396, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8883521556854248, + "num_tokens": 547548828.0, + "step": 14347 + }, + { + "epoch": 1.825213077216639, + "grad_norm": 1.619767189025879, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8705973029136658, + "num_tokens": 547584529.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "grad_norm": 1.5117722749710083, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8757249116897583, + "num_tokens": 547623093.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "grad_norm": 1.4861277341842651, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8821431398391724, + "num_tokens": 547661141.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "grad_norm": 1.5948275327682495, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.862025797367096, + "num_tokens": 547698454.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "grad_norm": 1.4009249210357666, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8668161630630493, + "num_tokens": 547739915.0, + "step": 14352 + }, + { + "epoch": 
1.8258491286095917, + "grad_norm": 1.4782037734985352, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8932397961616516, + "num_tokens": 547775025.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "grad_norm": 1.4885467290878296, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8766810894012451, + "num_tokens": 547808948.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "grad_norm": 1.310390830039978, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8930473327636719, + "num_tokens": 547852562.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "grad_norm": 1.5129222869873047, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8795214295387268, + "num_tokens": 547889666.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "grad_norm": 1.4481236934661865, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8924843668937683, + "num_tokens": 547923095.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "grad_norm": 1.443568229675293, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8738715648651123, + "num_tokens": 547963549.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "grad_norm": 1.581634759902954, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8751162886619568, + "num_tokens": 547996440.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "grad_norm": 1.4398620128631592, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8766162395477295, + "num_tokens": 548034377.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "grad_norm": 1.3970988988876343, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8851280808448792, + "num_tokens": 548075918.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "grad_norm": 1.4256713390350342, + "learning_rate": 1e-06, + "loss": 0.3666, + 
"mean_token_accuracy": 0.8711953163146973, + "num_tokens": 548117436.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "grad_norm": 1.4256086349487305, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8853802680969238, + "num_tokens": 548159049.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "grad_norm": 1.3901699781417847, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8698986768722534, + "num_tokens": 548202531.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "grad_norm": 1.6908568143844604, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8693347573280334, + "num_tokens": 548234707.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "grad_norm": 1.532361626625061, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8713772296905518, + "num_tokens": 548276256.0, + "step": 14366 + }, + { + "epoch": 1.8276300725098586, + "grad_norm": 1.4837141036987305, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8916240930557251, + "num_tokens": 548311908.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "grad_norm": 1.5779262781143188, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8704862594604492, + "num_tokens": 548354289.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "grad_norm": 1.466715931892395, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8781561851501465, + "num_tokens": 548402740.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "grad_norm": 1.656590461730957, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8807781338691711, + "num_tokens": 548440152.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "grad_norm": 1.4754198789596558, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8848352432250977, + "num_tokens": 548481820.0, + "step": 14371 + }, + { + "epoch": 
1.8282661239028113, + "grad_norm": 1.5226396322250366, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8742460012435913, + "num_tokens": 548523489.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "grad_norm": 1.7415363788604736, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8784506916999817, + "num_tokens": 548555863.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "grad_norm": 1.6946030855178833, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8704674243927002, + "num_tokens": 548589182.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "grad_norm": 1.5463687181472778, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8779187202453613, + "num_tokens": 548623279.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "grad_norm": 1.514371395111084, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8800455331802368, + "num_tokens": 548660071.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "grad_norm": 1.4171277284622192, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8661978840827942, + "num_tokens": 548701375.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "grad_norm": 1.3788834810256958, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8802707195281982, + "num_tokens": 548741964.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "grad_norm": 1.40784752368927, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8774656653404236, + "num_tokens": 548784166.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "grad_norm": 1.7112081050872803, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8790635466575623, + "num_tokens": 548817306.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "grad_norm": 1.7139569520950317, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 
0.8618720769882202, + "num_tokens": 548853005.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "grad_norm": 1.5559616088867188, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8719078302383423, + "num_tokens": 548889753.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "grad_norm": 1.4181450605392456, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8785973191261292, + "num_tokens": 548928513.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "grad_norm": 1.301450490951538, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.879895806312561, + "num_tokens": 548973837.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "grad_norm": 1.4100871086120605, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8932170867919922, + "num_tokens": 549014161.0, + "step": 14385 + }, + { + "epoch": 1.8300470678030785, + "grad_norm": 1.4130775928497314, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8723383545875549, + "num_tokens": 549059099.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "grad_norm": 1.6574779748916626, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8673341274261475, + "num_tokens": 549092497.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "grad_norm": 1.5429095029830933, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8657305240631104, + "num_tokens": 549131038.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "grad_norm": 1.6612502336502075, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8967288732528687, + "num_tokens": 549159642.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "grad_norm": 1.5424590110778809, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8770138025283813, + "num_tokens": 549194569.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "grad_norm": 
1.5921603441238403, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8807380795478821, + "num_tokens": 549228549.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "grad_norm": 1.5705842971801758, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8679498434066772, + "num_tokens": 549266346.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "grad_norm": 1.5579240322113037, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8857365846633911, + "num_tokens": 549303625.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "grad_norm": 1.5059738159179688, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8768341541290283, + "num_tokens": 549341561.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "grad_norm": 1.404958724975586, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8833110332489014, + "num_tokens": 549379492.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "grad_norm": 1.5458706617355347, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8767377138137817, + "num_tokens": 549416202.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "grad_norm": 1.4994826316833496, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8927137851715088, + "num_tokens": 549451197.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "grad_norm": 1.4501830339431763, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8853499889373779, + "num_tokens": 549492205.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "grad_norm": 1.4063804149627686, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8845961093902588, + "num_tokens": 549532611.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "grad_norm": 1.4726120233535767, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8834507465362549, + "num_tokens": 
549570771.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "grad_norm": 1.6256427764892578, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8826655745506287, + "num_tokens": 549602539.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "grad_norm": 1.4268063306808472, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8832918405532837, + "num_tokens": 549642050.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "grad_norm": 1.3744676113128662, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8911694884300232, + "num_tokens": 549679087.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "grad_norm": 1.4791902303695679, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8890293836593628, + "num_tokens": 549714623.0, + "step": 14404 + }, + { + "epoch": 1.8324640630962983, + "grad_norm": 1.5394233465194702, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8685280084609985, + "num_tokens": 549753213.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "grad_norm": 1.4540824890136719, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8886920809745789, + "num_tokens": 549790946.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "grad_norm": 1.4189424514770508, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8833521604537964, + "num_tokens": 549830901.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "grad_norm": 1.4397413730621338, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8876009583473206, + "num_tokens": 549870259.0, + "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "grad_norm": 1.6992309093475342, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8826444149017334, + "num_tokens": 549902384.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "grad_norm": 1.543744444847107, + 
"learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8783998489379883, + "num_tokens": 549939429.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "grad_norm": 1.6895250082015991, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8841471672058105, + "num_tokens": 549970149.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "grad_norm": 1.421177625656128, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8917348980903625, + "num_tokens": 550009348.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "grad_norm": 1.5272175073623657, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8551362752914429, + "num_tokens": 550052360.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "grad_norm": 1.4761959314346313, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8745484352111816, + "num_tokens": 550092356.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "grad_norm": 1.4142019748687744, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8769014477729797, + "num_tokens": 550131534.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "grad_norm": 1.6103841066360474, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.858466625213623, + "num_tokens": 550170333.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "grad_norm": 1.5154588222503662, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8769645690917969, + "num_tokens": 550206170.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "grad_norm": 1.428130030632019, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8746277093887329, + "num_tokens": 550245312.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "grad_norm": 1.5242934226989746, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8687043190002441, + "num_tokens": 550284598.0, + 
"step": 14419 + }, + { + "epoch": 1.8343722172751558, + "grad_norm": 1.5129178762435913, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8785257935523987, + "num_tokens": 550320703.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "grad_norm": 1.4182275533676147, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.874351441860199, + "num_tokens": 550362316.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "grad_norm": 1.5001955032348633, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8707365989685059, + "num_tokens": 550403504.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "grad_norm": 1.4291415214538574, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8867942094802856, + "num_tokens": 550440402.0, + "step": 14423 + }, + { + "epoch": 1.8348810583895179, + "grad_norm": 1.3465907573699951, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.88875412940979, + "num_tokens": 550480174.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "grad_norm": 1.5890709161758423, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8833901286125183, + "num_tokens": 550513760.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "grad_norm": 1.626408576965332, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8699004054069519, + "num_tokens": 550548088.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "grad_norm": 1.5421559810638428, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8793609142303467, + "num_tokens": 550582770.0, + "step": 14427 + }, + { + "epoch": 1.83538989950388, + "grad_norm": 1.5206449031829834, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8755162954330444, + "num_tokens": 550619519.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "grad_norm": 1.4775581359863281, + "learning_rate": 1e-06, + "loss": 
0.3851, + "mean_token_accuracy": 0.8676885366439819, + "num_tokens": 550663770.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "grad_norm": 1.500357985496521, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8848949670791626, + "num_tokens": 550698789.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "grad_norm": 1.407670021057129, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8877328038215637, + "num_tokens": 550739472.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "grad_norm": 1.4449695348739624, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8898435831069946, + "num_tokens": 550778322.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "grad_norm": 1.4224331378936768, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8843164443969727, + "num_tokens": 550819611.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "grad_norm": 1.4463454484939575, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8735413551330566, + "num_tokens": 550859909.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "grad_norm": 1.5523489713668823, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8857406973838806, + "num_tokens": 550898897.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "grad_norm": 1.5251601934432983, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8834757208824158, + "num_tokens": 550933272.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "grad_norm": 1.5419881343841553, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8685221672058105, + "num_tokens": 550971099.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "grad_norm": 1.4663034677505493, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8734522461891174, + "num_tokens": 551009011.0, + "step": 14438 + }, + { + "epoch": 
1.8367892125683756, + "grad_norm": 1.5143994092941284, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8795549869537354, + "num_tokens": 551044696.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "grad_norm": 1.4782177209854126, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8740407228469849, + "num_tokens": 551082746.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "grad_norm": 1.4406256675720215, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8805223107337952, + "num_tokens": 551122039.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "grad_norm": 1.5355219841003418, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8787182569503784, + "num_tokens": 551158003.0, + "step": 14442 + }, + { + "epoch": 1.8372980536827375, + "grad_norm": 1.5317332744598389, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8839457035064697, + "num_tokens": 551193601.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "grad_norm": 1.5357121229171753, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8814793825149536, + "num_tokens": 551238249.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "grad_norm": 1.397052526473999, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8881100416183472, + "num_tokens": 551276696.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "grad_norm": 1.3514975309371948, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8868850469589233, + "num_tokens": 551315578.0, + "step": 14446 + }, + { + "epoch": 1.8378068947970996, + "grad_norm": 1.4258830547332764, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8823956251144409, + "num_tokens": 551355373.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "grad_norm": 1.523712396621704, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 
0.8715176582336426, + "num_tokens": 551392575.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "grad_norm": 1.5377527475357056, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.879593014717102, + "num_tokens": 551428156.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "grad_norm": 1.4607973098754883, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8731397390365601, + "num_tokens": 551470036.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "grad_norm": 1.5074063539505005, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8659756779670715, + "num_tokens": 551509088.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "grad_norm": 1.5071145296096802, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8770059943199158, + "num_tokens": 551544671.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "grad_norm": 1.3563578128814697, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8736760020256042, + "num_tokens": 551588071.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "grad_norm": 1.5245318412780762, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8897716999053955, + "num_tokens": 551624060.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "grad_norm": 1.4912840127944946, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8805825710296631, + "num_tokens": 551660512.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "grad_norm": 1.463749647140503, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8824054002761841, + "num_tokens": 551701533.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "grad_norm": 1.4516971111297607, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8823224306106567, + "num_tokens": 551742478.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "grad_norm": 
1.4360164403915405, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8652955293655396, + "num_tokens": 551785920.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "grad_norm": 1.4781968593597412, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8733378648757935, + "num_tokens": 551823683.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "grad_norm": 1.5188498497009277, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8704029321670532, + "num_tokens": 551862729.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "grad_norm": 1.4864914417266846, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8825628161430359, + "num_tokens": 551900924.0, + "step": 14461 + }, + { + "epoch": 1.8397150489759573, + "grad_norm": 1.605989933013916, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8706650137901306, + "num_tokens": 551938391.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "grad_norm": 1.4428521394729614, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8832021951675415, + "num_tokens": 551977328.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "grad_norm": 1.4079838991165161, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8722943067550659, + "num_tokens": 552021867.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "grad_norm": 1.5518171787261963, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8746406435966492, + "num_tokens": 552060187.0, + "step": 14465 + }, + { + "epoch": 1.8402238900903192, + "grad_norm": 1.3318015336990356, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8685738444328308, + "num_tokens": 552105314.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "grad_norm": 1.5483858585357666, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8789612054824829, + 
"num_tokens": 552143182.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "grad_norm": 1.5920960903167725, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8837342262268066, + "num_tokens": 552175114.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "grad_norm": 1.4874216318130493, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8767572641372681, + "num_tokens": 552214310.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "grad_norm": 1.4176079034805298, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8813759088516235, + "num_tokens": 552254034.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "grad_norm": 1.5062446594238281, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8773304224014282, + "num_tokens": 552289683.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "grad_norm": 1.6349891424179077, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8643702268600464, + "num_tokens": 552324530.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "grad_norm": 1.6313393115997314, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8780570030212402, + "num_tokens": 552360398.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "grad_norm": 1.4606215953826904, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8812525272369385, + "num_tokens": 552398997.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "grad_norm": 1.4534906148910522, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8851825594902039, + "num_tokens": 552437024.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "grad_norm": 1.4637490510940552, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8823394179344177, + "num_tokens": 552474564.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "grad_norm": 1.554949164390564, + 
"learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8717941045761108, + "num_tokens": 552513034.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "grad_norm": 1.6258845329284668, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8753949999809265, + "num_tokens": 552544694.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "grad_norm": 1.5488327741622925, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8899680376052856, + "num_tokens": 552579402.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "grad_norm": 1.449116826057434, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8767624497413635, + "num_tokens": 552618086.0, + "step": 14480 + }, + { + "epoch": 1.842132044269177, + "grad_norm": 1.4232611656188965, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8825567960739136, + "num_tokens": 552655946.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "grad_norm": 1.5158612728118896, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8690156936645508, + "num_tokens": 552693719.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "grad_norm": 1.3634313344955444, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8894954919815063, + "num_tokens": 552733626.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "grad_norm": 1.5440012216567993, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.882020115852356, + "num_tokens": 552768063.0, + "step": 14484 + }, + { + "epoch": 1.842640885383539, + "grad_norm": 1.5709048509597778, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8719539046287537, + "num_tokens": 552805014.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "grad_norm": 1.4873679876327515, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8779470920562744, + "num_tokens": 552844942.0, + "step": 
14486 + }, + { + "epoch": 1.84289530594072, + "grad_norm": 1.508910059928894, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8767761588096619, + "num_tokens": 552882047.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "grad_norm": 1.5620733499526978, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8484205007553101, + "num_tokens": 552924676.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "grad_norm": 1.6250643730163574, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8949649333953857, + "num_tokens": 552958040.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "grad_norm": 1.5723053216934204, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8747839331626892, + "num_tokens": 552991451.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "grad_norm": 1.2742208242416382, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8839927911758423, + "num_tokens": 553035799.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "grad_norm": 1.384092926979065, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8942455053329468, + "num_tokens": 553074456.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "grad_norm": 1.5543731451034546, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8819785118103027, + "num_tokens": 553107216.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "grad_norm": 1.3324838876724243, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8840848207473755, + "num_tokens": 553150558.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "grad_norm": 1.542291522026062, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8776149749755859, + "num_tokens": 553184565.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "grad_norm": 1.5182889699935913, + "learning_rate": 1e-06, + "loss": 0.3502, + 
"mean_token_accuracy": 0.8776769042015076, + "num_tokens": 553219830.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "grad_norm": 1.2913023233413696, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8774498105049133, + "num_tokens": 553266490.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "grad_norm": 1.4455989599227905, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.875983476638794, + "num_tokens": 553307890.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "grad_norm": 1.4288005828857422, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8772053122520447, + "num_tokens": 553350548.0, + "step": 14499 + }, + { + "epoch": 1.8445490395623967, + "grad_norm": 1.5978593826293945, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8687595129013062, + "num_tokens": 553387859.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "grad_norm": 1.5026429891586304, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8803715705871582, + "num_tokens": 553426283.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "grad_norm": 1.4821314811706543, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8847957849502563, + "num_tokens": 553465456.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "grad_norm": 1.4518048763275146, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8809828162193298, + "num_tokens": 553503321.0, + "step": 14503 + }, + { + "epoch": 1.8450578806767588, + "grad_norm": 1.506503701210022, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.885734498500824, + "num_tokens": 553540932.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "grad_norm": 1.5159608125686646, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8822790384292603, + "num_tokens": 553578487.0, + "step": 14505 + }, + { + "epoch": 
1.8453123012339399, + "grad_norm": 1.4439126253128052, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8777881860733032, + "num_tokens": 553620964.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "grad_norm": 1.3729183673858643, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8818747997283936, + "num_tokens": 553665661.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "grad_norm": 1.5022523403167725, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8721088767051697, + "num_tokens": 553704271.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "grad_norm": 1.6002460718154907, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8717115521430969, + "num_tokens": 553744584.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "grad_norm": 1.7744100093841553, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.87504643201828, + "num_tokens": 553776074.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "grad_norm": 1.4467540979385376, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8730243444442749, + "num_tokens": 553817375.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "grad_norm": 1.553891897201538, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8674045205116272, + "num_tokens": 553856112.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "grad_norm": 1.5332177877426147, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8844521045684814, + "num_tokens": 553895193.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "grad_norm": 1.5361647605895996, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8538398146629333, + "num_tokens": 553936044.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "grad_norm": 1.447411298751831, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 
0.8656494617462158, + "num_tokens": 553976461.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "grad_norm": 1.4838947057724, + "learning_rate": 1e-06, + "loss": 0.278, + "mean_token_accuracy": 0.9009109139442444, + "num_tokens": 554010838.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "grad_norm": 1.606980800628662, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8662493228912354, + "num_tokens": 554044230.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "grad_norm": 1.4470092058181763, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8820991516113281, + "num_tokens": 554080730.0, + "step": 14518 + }, + { + "epoch": 1.8469660348556163, + "grad_norm": 1.4524343013763428, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8628320693969727, + "num_tokens": 554122518.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "grad_norm": 1.5982638597488403, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8665297627449036, + "num_tokens": 554162107.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "grad_norm": 1.4223169088363647, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8824554681777954, + "num_tokens": 554203194.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "grad_norm": 1.4412380456924438, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.888827383518219, + "num_tokens": 554239391.0, + "step": 14522 + }, + { + "epoch": 1.8474748759699784, + "grad_norm": 1.5889122486114502, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8713774085044861, + "num_tokens": 554274199.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "grad_norm": 1.5920138359069824, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8710988759994507, + "num_tokens": 554314349.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "grad_norm": 
1.4189770221710205, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8826171159744263, + "num_tokens": 554353760.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "grad_norm": 1.5412890911102295, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8881579041481018, + "num_tokens": 554389594.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "grad_norm": 1.5315263271331787, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8784140348434448, + "num_tokens": 554424918.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "grad_norm": 1.6170564889907837, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8744520545005798, + "num_tokens": 554459790.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "grad_norm": 1.440443992614746, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8862488269805908, + "num_tokens": 554496789.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "grad_norm": 1.583727240562439, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8709481954574585, + "num_tokens": 554529725.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "grad_norm": 1.5846558809280396, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8795334100723267, + "num_tokens": 554563234.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "grad_norm": 1.5883132219314575, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.875691294670105, + "num_tokens": 554598069.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "grad_norm": 1.5050162076950073, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.864699125289917, + "num_tokens": 554639952.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "grad_norm": 1.5258830785751343, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8777298331260681, + "num_tokens": 
554676333.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "grad_norm": 1.3771662712097168, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8825193643569946, + "num_tokens": 554718688.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "grad_norm": 1.3850769996643066, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8838511109352112, + "num_tokens": 554757780.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "grad_norm": 1.4666264057159424, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8816659450531006, + "num_tokens": 554796437.0, + "step": 14537 + }, + { + "epoch": 1.8493830301488359, + "grad_norm": 1.4020137786865234, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8871955871582031, + "num_tokens": 554834006.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "grad_norm": 1.5225170850753784, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8730734586715698, + "num_tokens": 554870631.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "grad_norm": 1.4692399501800537, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8799887895584106, + "num_tokens": 554912485.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "grad_norm": 1.4091535806655884, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8753788471221924, + "num_tokens": 554951907.0, + "step": 14541 + }, + { + "epoch": 1.849891871263198, + "grad_norm": 1.5648733377456665, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8710572123527527, + "num_tokens": 554985364.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "grad_norm": 1.4360345602035522, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.876693069934845, + "num_tokens": 555024471.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "grad_norm": 1.6367007493972778, + "learning_rate": 
1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8682498931884766, + "num_tokens": 555058732.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "grad_norm": 1.4308240413665771, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8696105480194092, + "num_tokens": 555102083.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "grad_norm": 1.493330955505371, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8812872171401978, + "num_tokens": 555137649.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "grad_norm": 1.5410184860229492, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8762398362159729, + "num_tokens": 555175716.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "grad_norm": 1.4944173097610474, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8822983503341675, + "num_tokens": 555214971.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "grad_norm": 1.6115713119506836, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8792887926101685, + "num_tokens": 555249548.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "grad_norm": 1.4597549438476562, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8801586031913757, + "num_tokens": 555287870.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "grad_norm": 1.4850265979766846, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8870159387588501, + "num_tokens": 555321412.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "grad_norm": 1.5717462301254272, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8770517706871033, + "num_tokens": 555354350.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "grad_norm": 1.4398208856582642, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8798962235450745, + "num_tokens": 555393637.0, + "step": 14553 + }, + 
{ + "epoch": 1.8514183946062843, + "grad_norm": 1.5389989614486694, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.866073489189148, + "num_tokens": 555429174.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "grad_norm": 1.5616748332977295, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8856094479560852, + "num_tokens": 555465262.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "grad_norm": 1.546797752380371, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8610022664070129, + "num_tokens": 555504429.0, + "step": 14556 + }, + { + "epoch": 1.8518000254420557, + "grad_norm": 1.4689860343933105, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8634331226348877, + "num_tokens": 555545711.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "grad_norm": 1.501727819442749, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8721506595611572, + "num_tokens": 555583956.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "grad_norm": 1.5790133476257324, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8651618957519531, + "num_tokens": 555622615.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "grad_norm": 1.5363236665725708, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8748358488082886, + "num_tokens": 555659004.0, + "step": 14560 + }, + { + "epoch": 1.8523088665564178, + "grad_norm": 1.5427751541137695, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8744935989379883, + "num_tokens": 555693281.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "grad_norm": 1.4330930709838867, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8701151609420776, + "num_tokens": 555734831.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "grad_norm": 1.408692479133606, + "learning_rate": 1e-06, + "loss": 0.3421, + 
"mean_token_accuracy": 0.8796889781951904, + "num_tokens": 555774208.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "grad_norm": 1.6003446578979492, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8788057565689087, + "num_tokens": 555807757.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "grad_norm": 1.5066770315170288, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8654534220695496, + "num_tokens": 555848329.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "grad_norm": 1.4992642402648926, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8888236284255981, + "num_tokens": 555884855.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "grad_norm": 1.601605772972107, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.877886176109314, + "num_tokens": 555920591.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "grad_norm": 1.5817160606384277, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8904316425323486, + "num_tokens": 555953790.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "grad_norm": 1.4937690496444702, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.887060821056366, + "num_tokens": 555991686.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "grad_norm": 1.4781290292739868, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8723325729370117, + "num_tokens": 556031138.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "grad_norm": 1.6321473121643066, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8685879111289978, + "num_tokens": 556068567.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "grad_norm": 1.4114748239517212, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8786788582801819, + "num_tokens": 556107014.0, + "step": 14572 + }, + { + "epoch": 
1.853835389899504, + "grad_norm": 1.459660530090332, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8813244104385376, + "num_tokens": 556149021.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "grad_norm": 1.6186881065368652, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8746992349624634, + "num_tokens": 556185515.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "grad_norm": 1.459270715713501, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8674627542495728, + "num_tokens": 556225749.0, + "step": 14575 + }, + { + "epoch": 1.8542170207352755, + "grad_norm": 1.5233256816864014, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8822197914123535, + "num_tokens": 556261989.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "grad_norm": 1.6395190954208374, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8798612356185913, + "num_tokens": 556294044.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "grad_norm": 1.473440408706665, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8662840127944946, + "num_tokens": 556333068.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "grad_norm": 1.4551403522491455, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8880554437637329, + "num_tokens": 556371297.0, + "step": 14579 + }, + { + "epoch": 1.8547258618496376, + "grad_norm": 1.5217703580856323, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8601915836334229, + "num_tokens": 556411770.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "grad_norm": 1.5883820056915283, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8762776851654053, + "num_tokens": 556447823.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "grad_norm": 1.3317515850067139, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 
0.8801320791244507, + "num_tokens": 556490369.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "grad_norm": 1.4859366416931152, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8931736946105957, + "num_tokens": 556523640.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "grad_norm": 1.5127527713775635, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8693894743919373, + "num_tokens": 556565185.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "grad_norm": 1.6658462285995483, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8855289220809937, + "num_tokens": 556596951.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "grad_norm": 1.5222375392913818, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8709348440170288, + "num_tokens": 556634992.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "grad_norm": 1.4622489213943481, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.873428225517273, + "num_tokens": 556678762.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "grad_norm": 1.4221397638320923, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8820154666900635, + "num_tokens": 556717470.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "grad_norm": 1.496250033378601, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.88343346118927, + "num_tokens": 556754391.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "grad_norm": 1.4670206308364868, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8772972226142883, + "num_tokens": 556793696.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "grad_norm": 1.5060391426086426, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8673042058944702, + "num_tokens": 556834335.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "grad_norm": 
1.5299663543701172, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8847320675849915, + "num_tokens": 556870886.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "grad_norm": 1.447089672088623, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8756437301635742, + "num_tokens": 556910590.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "grad_norm": 1.5467936992645264, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8777984380722046, + "num_tokens": 556944862.0, + "step": 14594 + }, + { + "epoch": 1.856634016028495, + "grad_norm": 1.6224792003631592, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8738072514533997, + "num_tokens": 556980805.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "grad_norm": 1.5296432971954346, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8674790263175964, + "num_tokens": 557019832.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "grad_norm": 1.5081201791763306, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.853718638420105, + "num_tokens": 557056408.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "grad_norm": 1.6234450340270996, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8671861886978149, + "num_tokens": 557091244.0, + "step": 14598 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 1.5039314031600952, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8830500245094299, + "num_tokens": 557126131.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "grad_norm": 1.5780917406082153, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8708992004394531, + "num_tokens": 557167270.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "grad_norm": 1.4424641132354736, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8813667297363281, + 
"num_tokens": 557208228.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "grad_norm": 1.4670449495315552, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.87894207239151, + "num_tokens": 557247384.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "grad_norm": 1.5748652219772339, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8723594546318054, + "num_tokens": 557281648.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "grad_norm": 1.5827662944793701, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.859908938407898, + "num_tokens": 557321977.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "grad_norm": 1.3995156288146973, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8599751591682434, + "num_tokens": 557368821.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "grad_norm": 1.4926916360855103, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.874480664730072, + "num_tokens": 557412082.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "grad_norm": 1.5158122777938843, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8785529732704163, + "num_tokens": 557450613.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "grad_norm": 1.4614616632461548, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8828414678573608, + "num_tokens": 557492196.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "grad_norm": 1.4869410991668701, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8862557411193848, + "num_tokens": 557526917.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "grad_norm": 1.6623880863189697, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8758150339126587, + "num_tokens": 557560420.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "grad_norm": 1.5311676263809204, + 
"learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8806740045547485, + "num_tokens": 557597129.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "grad_norm": 1.441824197769165, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8822405934333801, + "num_tokens": 557639037.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "grad_norm": 1.4579766988754272, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8941441774368286, + "num_tokens": 557678114.0, + "step": 14613 + }, + { + "epoch": 1.8590510113217147, + "grad_norm": 1.5128428936004639, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8759893178939819, + "num_tokens": 557716470.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "grad_norm": 1.6575814485549927, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.876213788986206, + "num_tokens": 557750589.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "grad_norm": 1.51373291015625, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8935167789459229, + "num_tokens": 557785664.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "grad_norm": 1.488174319267273, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8908367156982422, + "num_tokens": 557818988.0, + "step": 14617 + }, + { + "epoch": 1.8595598524360768, + "grad_norm": 1.5020092725753784, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8706809878349304, + "num_tokens": 557857444.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "grad_norm": 1.4660279750823975, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8757072687149048, + "num_tokens": 557897769.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "grad_norm": 1.4745453596115112, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8840947151184082, + "num_tokens": 557936145.0, + 
"step": 14620 + }, + { + "epoch": 1.8599414832718484, + "grad_norm": 1.4039463996887207, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8896931409835815, + "num_tokens": 557972972.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "grad_norm": 1.553621530532837, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8706567883491516, + "num_tokens": 558014923.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "grad_norm": 1.4670060873031616, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8908278942108154, + "num_tokens": 558049277.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "grad_norm": 1.634434700012207, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8691632747650146, + "num_tokens": 558080491.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "grad_norm": 1.4631657600402832, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8837651014328003, + "num_tokens": 558118792.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "grad_norm": 1.4690910577774048, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8652764558792114, + "num_tokens": 558163002.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "grad_norm": 1.3343099355697632, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8977919816970825, + "num_tokens": 558203548.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "grad_norm": 1.4911178350448608, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.882779598236084, + "num_tokens": 558237866.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "grad_norm": 1.4616411924362183, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8830127716064453, + "num_tokens": 558276916.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "grad_norm": 1.3850973844528198, + "learning_rate": 1e-06, + "loss": 
0.2841, + "mean_token_accuracy": 0.898240327835083, + "num_tokens": 558312887.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "grad_norm": 1.455233097076416, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8765150308609009, + "num_tokens": 558353407.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "grad_norm": 1.4273664951324463, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8848474025726318, + "num_tokens": 558394434.0, + "step": 14632 + }, + { + "epoch": 1.8614680066149345, + "grad_norm": 1.5508347749710083, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.890484094619751, + "num_tokens": 558427859.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "grad_norm": 1.4435845613479614, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8799230456352234, + "num_tokens": 558468891.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "grad_norm": 1.4518095254898071, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8810176849365234, + "num_tokens": 558506784.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "grad_norm": 1.5329480171203613, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.878024697303772, + "num_tokens": 558545541.0, + "step": 14636 + }, + { + "epoch": 1.8619768477292964, + "grad_norm": 1.3663543462753296, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8751116394996643, + "num_tokens": 558588704.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "grad_norm": 1.401854395866394, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8908525109291077, + "num_tokens": 558628412.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "grad_norm": 1.3830993175506592, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8811363577842712, + "num_tokens": 558671315.0, + "step": 14639 + }, + { + "epoch": 
1.862358478565068, + "grad_norm": 1.3763669729232788, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8858869671821594, + "num_tokens": 558710544.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "grad_norm": 1.5711489915847778, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8689882159233093, + "num_tokens": 558751269.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "grad_norm": 1.448044776916504, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8964088559150696, + "num_tokens": 558786527.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "grad_norm": 1.5355077981948853, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8818641901016235, + "num_tokens": 558824457.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "grad_norm": 1.7491064071655273, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8903435468673706, + "num_tokens": 558853200.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "grad_norm": 1.4935799837112427, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8591896891593933, + "num_tokens": 558894392.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "grad_norm": 1.3707361221313477, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.874104917049408, + "num_tokens": 558939826.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "grad_norm": 1.4943279027938843, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8892652988433838, + "num_tokens": 558975872.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "grad_norm": 1.470913052558899, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8832200765609741, + "num_tokens": 559018984.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "grad_norm": 1.547290563583374, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 
0.8739498257637024, + "num_tokens": 559055516.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "grad_norm": 1.6168878078460693, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8825836777687073, + "num_tokens": 559088386.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "grad_norm": 1.44369375705719, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8773349523544312, + "num_tokens": 559129019.0, + "step": 14651 + }, + { + "epoch": 1.8638850019081543, + "grad_norm": 1.526102900505066, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8703435659408569, + "num_tokens": 559164360.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "grad_norm": 1.3923827409744263, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8879837989807129, + "num_tokens": 559203853.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "grad_norm": 1.4178440570831299, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8825784921646118, + "num_tokens": 559244375.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "grad_norm": 1.5429226160049438, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8774979710578918, + "num_tokens": 559280113.0, + "step": 14655 + }, + { + "epoch": 1.8643938430225162, + "grad_norm": 1.4162615537643433, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8792930841445923, + "num_tokens": 559322063.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "grad_norm": 1.5288777351379395, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8825170397758484, + "num_tokens": 559357498.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "grad_norm": 1.6446468830108643, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.864632248878479, + "num_tokens": 559392059.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "grad_norm": 
1.5072822570800781, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8694653511047363, + "num_tokens": 559431168.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "grad_norm": 1.55360746383667, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.875164806842804, + "num_tokens": 559465790.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "grad_norm": 1.482248306274414, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8851144313812256, + "num_tokens": 559503671.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "grad_norm": 1.532467007637024, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8777353167533875, + "num_tokens": 559538881.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "grad_norm": 1.445372462272644, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8755674958229065, + "num_tokens": 559584713.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "grad_norm": 1.5361461639404297, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.885145902633667, + "num_tokens": 559617477.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "grad_norm": 1.478540301322937, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8871753811836243, + "num_tokens": 559651453.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "grad_norm": 1.461853265762329, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8878117203712463, + "num_tokens": 559687211.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "grad_norm": 1.6262043714523315, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8749210834503174, + "num_tokens": 559720711.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "grad_norm": 1.569033145904541, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.86850506067276, + "num_tokens": 
559755992.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "grad_norm": 1.3255048990249634, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8894834518432617, + "num_tokens": 559798442.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "grad_norm": 1.4676518440246582, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8751810789108276, + "num_tokens": 559834660.0, + "step": 14670 + }, + { + "epoch": 1.866301997201374, + "grad_norm": 1.6135669946670532, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8776438236236572, + "num_tokens": 559866319.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "grad_norm": 1.5002247095108032, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8918685913085938, + "num_tokens": 559897920.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "grad_norm": 1.4004459381103516, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8747067451477051, + "num_tokens": 559941856.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "grad_norm": 1.4357922077178955, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8746063709259033, + "num_tokens": 559983746.0, + "step": 14674 + }, + { + "epoch": 1.866810838315736, + "grad_norm": 1.4630435705184937, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8673074245452881, + "num_tokens": 560025131.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "grad_norm": 1.3496382236480713, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8901938796043396, + "num_tokens": 560067643.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "grad_norm": 1.6174125671386719, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8811675310134888, + "num_tokens": 560099116.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "grad_norm": 1.7350653409957886, + "learning_rate": 
1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8769540190696716, + "num_tokens": 560131088.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "grad_norm": 1.3747469186782837, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8884278535842896, + "num_tokens": 560171931.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "grad_norm": 1.6611218452453613, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.861524224281311, + "num_tokens": 560206031.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "grad_norm": 1.6017727851867676, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.873464047908783, + "num_tokens": 560238781.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "grad_norm": 1.5210018157958984, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8891729712486267, + "num_tokens": 560277519.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "grad_norm": 1.5087319612503052, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8672731518745422, + "num_tokens": 560313892.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "grad_norm": 1.474858045578003, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8786836266517639, + "num_tokens": 560348997.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "grad_norm": 1.3555512428283691, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8952515125274658, + "num_tokens": 560390490.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "grad_norm": 1.3847030401229858, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8838639259338379, + "num_tokens": 560432206.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "grad_norm": 1.5204648971557617, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8636658191680908, + "num_tokens": 560472270.0, + "step": 14687 + }, + { + 
"epoch": 1.8684645719374124, + "grad_norm": 1.6404879093170166, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8709604144096375, + "num_tokens": 560504828.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "grad_norm": 1.4869194030761719, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8736974000930786, + "num_tokens": 560544410.0, + "step": 14689 + }, + { + "epoch": 1.8687189924945935, + "grad_norm": 1.40509831905365, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8861479759216309, + "num_tokens": 560584769.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "grad_norm": 1.420900821685791, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8712542057037354, + "num_tokens": 560626186.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "grad_norm": 1.7503974437713623, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8750441074371338, + "num_tokens": 560654728.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "grad_norm": 1.2920722961425781, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.892602801322937, + "num_tokens": 560698739.0, + "step": 14693 + }, + { + "epoch": 1.8692278336089556, + "grad_norm": 1.55971097946167, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8856642246246338, + "num_tokens": 560730075.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "grad_norm": 1.6140409708023071, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8709398508071899, + "num_tokens": 560769690.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "grad_norm": 1.3898292779922485, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8666871786117554, + "num_tokens": 560817663.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "grad_norm": 1.5393438339233398, + "learning_rate": 1e-06, + "loss": 0.3395, + 
"mean_token_accuracy": 0.8789410591125488, + "num_tokens": 560853649.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "grad_norm": 1.418166995048523, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8807610273361206, + "num_tokens": 560896197.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "grad_norm": 1.5877755880355835, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8678194284439087, + "num_tokens": 560930535.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "grad_norm": 1.637982726097107, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.86669921875, + "num_tokens": 560965027.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "grad_norm": 1.4290406703948975, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8870660066604614, + "num_tokens": 561004254.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "grad_norm": 1.6363606452941895, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8715792894363403, + "num_tokens": 561043400.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "grad_norm": 1.4737670421600342, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.881974458694458, + "num_tokens": 561081900.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "grad_norm": 1.421616792678833, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8749963641166687, + "num_tokens": 561120966.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "grad_norm": 1.3445159196853638, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8789942264556885, + "num_tokens": 561163612.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "grad_norm": 1.4775314331054688, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8849985003471375, + "num_tokens": 561201425.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, 
+ "grad_norm": 1.5123193264007568, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8581367135047913, + "num_tokens": 561242280.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "grad_norm": 1.744840145111084, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8840448260307312, + "num_tokens": 561270013.0, + "step": 14708 + }, + { + "epoch": 1.871135987787813, + "grad_norm": 1.4334027767181396, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8904537558555603, + "num_tokens": 561303090.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "grad_norm": 1.5172145366668701, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8755663633346558, + "num_tokens": 561340870.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "grad_norm": 1.4551875591278076, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.878014087677002, + "num_tokens": 561381802.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "grad_norm": 1.4051709175109863, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8810349106788635, + "num_tokens": 561422606.0, + "step": 14712 + }, + { + "epoch": 1.8716448289021752, + "grad_norm": 1.4505589008331299, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8798078298568726, + "num_tokens": 561463822.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "grad_norm": 1.413396954536438, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8880143165588379, + "num_tokens": 561504267.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "grad_norm": 1.4513423442840576, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8809191584587097, + "num_tokens": 561539592.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "grad_norm": 1.4069634675979614, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8936824202537537, 
+ "num_tokens": 561574649.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "grad_norm": 1.5410667657852173, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8636313676834106, + "num_tokens": 561614160.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "grad_norm": 1.5245903730392456, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8683367967605591, + "num_tokens": 561651714.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "grad_norm": 1.6088829040527344, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8814170956611633, + "num_tokens": 561691766.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "grad_norm": 1.6071555614471436, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8755047917366028, + "num_tokens": 561732779.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "grad_norm": 1.455933690071106, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8815963268280029, + "num_tokens": 561773826.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "grad_norm": 1.4570841789245605, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8816940188407898, + "num_tokens": 561813026.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "grad_norm": 1.4317364692687988, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8960114121437073, + "num_tokens": 561848881.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "grad_norm": 1.453428030014038, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8726755380630493, + "num_tokens": 561892308.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "grad_norm": 1.5355491638183594, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8762648105621338, + "num_tokens": 561928468.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "grad_norm": 1.656110167503357, + 
"learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8619673252105713, + "num_tokens": 561962733.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "grad_norm": 1.490131139755249, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8750998973846436, + "num_tokens": 562003185.0, + "step": 14727 + }, + { + "epoch": 1.873552983081033, + "grad_norm": 1.5232124328613281, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8682421445846558, + "num_tokens": 562040525.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "grad_norm": 1.5779376029968262, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8753911256790161, + "num_tokens": 562077342.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "grad_norm": 1.5748918056488037, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8671005964279175, + "num_tokens": 562116629.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "grad_norm": 1.2821707725524902, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8856629729270935, + "num_tokens": 562160990.0, + "step": 14731 + }, + { + "epoch": 1.874061824195395, + "grad_norm": 1.3017114400863647, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8859716057777405, + "num_tokens": 562204955.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "grad_norm": 1.4877318143844604, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8806449174880981, + "num_tokens": 562239739.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "grad_norm": 1.6306664943695068, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8714379668235779, + "num_tokens": 562274536.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "grad_norm": 1.5177090167999268, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8810890913009644, + "num_tokens": 562310522.0, + 
"step": 14735 + }, + { + "epoch": 1.874570665309757, + "grad_norm": 1.524816870689392, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8642673492431641, + "num_tokens": 562347436.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "grad_norm": 1.3905937671661377, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8939223289489746, + "num_tokens": 562385628.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "grad_norm": 1.3951200246810913, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.873573899269104, + "num_tokens": 562427410.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "grad_norm": 1.5388827323913574, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8719592094421387, + "num_tokens": 562469080.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "grad_norm": 1.5198930501937866, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8701324462890625, + "num_tokens": 562509344.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "grad_norm": 1.531714916229248, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8780722618103027, + "num_tokens": 562543364.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "grad_norm": 1.4899204969406128, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8749356865882874, + "num_tokens": 562584635.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "grad_norm": 1.5751453638076782, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8848327398300171, + "num_tokens": 562618918.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "grad_norm": 1.35440194606781, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8774288892745972, + "num_tokens": 562662400.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "grad_norm": 1.4596134424209595, + "learning_rate": 1e-06, + "loss": 
0.3221, + "mean_token_accuracy": 0.8900828957557678, + "num_tokens": 562699634.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "grad_norm": 1.5756778717041016, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8730051517486572, + "num_tokens": 562733078.0, + "step": 14746 + }, + { + "epoch": 1.8759699783742527, + "grad_norm": 1.4150278568267822, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8742552995681763, + "num_tokens": 562775031.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "grad_norm": 1.4687386751174927, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8822438716888428, + "num_tokens": 562811950.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "grad_norm": 1.383679747581482, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8891791105270386, + "num_tokens": 562852938.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "grad_norm": 1.4232786893844604, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8926975727081299, + "num_tokens": 562890248.0, + "step": 14750 + }, + { + "epoch": 1.8764788194886148, + "grad_norm": 1.4559694528579712, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8726975321769714, + "num_tokens": 562929404.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "grad_norm": 1.5851069688796997, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8771625757217407, + "num_tokens": 562964735.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "grad_norm": 1.48857843875885, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8859575986862183, + "num_tokens": 562998836.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "grad_norm": 1.4252755641937256, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8854799866676331, + "num_tokens": 563035947.0, + "step": 14754 + }, + { + "epoch": 
1.8769876606029767, + "grad_norm": 1.4452502727508545, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8864595890045166, + "num_tokens": 563075069.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "grad_norm": 1.472760796546936, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8853116631507874, + "num_tokens": 563112237.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "grad_norm": 1.394729495048523, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8794747591018677, + "num_tokens": 563153398.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "grad_norm": 1.4801764488220215, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8836362957954407, + "num_tokens": 563191363.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "grad_norm": 1.5325230360031128, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8745576739311218, + "num_tokens": 563230530.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "grad_norm": 1.4944008588790894, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8893760442733765, + "num_tokens": 563269020.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "grad_norm": 1.4739948511123657, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8860800266265869, + "num_tokens": 563308095.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "grad_norm": 1.5431568622589111, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8782699108123779, + "num_tokens": 563345184.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "grad_norm": 1.5464038848876953, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8729209303855896, + "num_tokens": 563379331.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "grad_norm": 1.6780651807785034, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 
0.8686144351959229, + "num_tokens": 563413825.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "grad_norm": 1.5038939714431763, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8859658241271973, + "num_tokens": 563452340.0, + "step": 14765 + }, + { + "epoch": 1.8783869736674723, + "grad_norm": 1.6447662115097046, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8735715746879578, + "num_tokens": 563488386.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "grad_norm": 1.5462530851364136, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.873298168182373, + "num_tokens": 563526431.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "grad_norm": 1.4833449125289917, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8872055411338806, + "num_tokens": 563561490.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "grad_norm": 1.4754881858825684, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8946452140808105, + "num_tokens": 563599411.0, + "step": 14769 + }, + { + "epoch": 1.8788958147818344, + "grad_norm": 1.4492712020874023, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8732210397720337, + "num_tokens": 563642420.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "grad_norm": 1.5717167854309082, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8585039973258972, + "num_tokens": 563682895.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "grad_norm": 1.610829472541809, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8652749061584473, + "num_tokens": 563716699.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "grad_norm": 1.6921451091766357, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8820275068283081, + "num_tokens": 563748304.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "grad_norm": 
1.3908897638320923, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.876772403717041, + "num_tokens": 563791385.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "grad_norm": 1.417310118675232, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8815068006515503, + "num_tokens": 563829816.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "grad_norm": 1.5178345441818237, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8702619075775146, + "num_tokens": 563865274.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "grad_norm": 1.3627530336380005, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.892877459526062, + "num_tokens": 563906087.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "grad_norm": 1.5599098205566406, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8753246665000916, + "num_tokens": 563943683.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "grad_norm": 1.480163812637329, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8842519521713257, + "num_tokens": 563980794.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "grad_norm": 1.4376295804977417, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8823051452636719, + "num_tokens": 564017944.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "grad_norm": 1.3937962055206299, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8831692337989807, + "num_tokens": 564061023.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "grad_norm": 1.44688880443573, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8848353624343872, + "num_tokens": 564099271.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "grad_norm": 1.5230153799057007, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.866264820098877, + "num_tokens": 
564135556.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "grad_norm": 1.487420916557312, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8719789385795593, + "num_tokens": 564174479.0, + "step": 14784 + }, + { + "epoch": 1.880803968960692, + "grad_norm": 1.400394082069397, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8798879384994507, + "num_tokens": 564214338.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "grad_norm": 1.5885192155838013, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8909441232681274, + "num_tokens": 564250313.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "grad_norm": 1.6156809329986572, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8810921311378479, + "num_tokens": 564283590.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "grad_norm": 1.4708548784255981, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8785841464996338, + "num_tokens": 564327423.0, + "step": 14788 + }, + { + "epoch": 1.881312810075054, + "grad_norm": 1.633255958557129, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8833521604537964, + "num_tokens": 564359295.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "grad_norm": 1.5018718242645264, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8661706447601318, + "num_tokens": 564400795.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "grad_norm": 1.6402631998062134, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8635286092758179, + "num_tokens": 564435194.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "grad_norm": 1.499818205833435, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8849034309387207, + "num_tokens": 564472294.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "grad_norm": 1.5342144966125488, + "learning_rate": 
1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8703325986862183, + "num_tokens": 564509991.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "grad_norm": 1.5301975011825562, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8741379976272583, + "num_tokens": 564546996.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "grad_norm": 1.577798843383789, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8720401525497437, + "num_tokens": 564583019.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "grad_norm": 1.4538718461990356, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8802751302719116, + "num_tokens": 564620960.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "grad_norm": 1.4813145399093628, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8729217052459717, + "num_tokens": 564661029.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "grad_norm": 1.4290809631347656, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8725745677947998, + "num_tokens": 564704527.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "grad_norm": 1.5829285383224487, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8691657185554504, + "num_tokens": 564738792.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "grad_norm": 1.415082335472107, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8862742185592651, + "num_tokens": 564777428.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "grad_norm": 1.5938518047332764, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8772162199020386, + "num_tokens": 564810740.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "grad_norm": 1.5530762672424316, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8870346546173096, + "num_tokens": 564846051.0, + "step": 14802 + }, 
+ { + "epoch": 1.8830937539753212, + "grad_norm": 1.3646912574768066, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8728973865509033, + "num_tokens": 564890312.0, + "step": 14803 + }, + { + "epoch": 1.8832209642539117, + "grad_norm": 1.5008277893066406, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8646444082260132, + "num_tokens": 564930993.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "grad_norm": 1.4174880981445312, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8852477073669434, + "num_tokens": 564968628.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "grad_norm": 1.5449810028076172, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8814843893051147, + "num_tokens": 565006628.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "grad_norm": 1.4461638927459717, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8755749464035034, + "num_tokens": 565047903.0, + "step": 14807 + }, + { + "epoch": 1.8837298053682736, + "grad_norm": 1.7778187990188599, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8856310844421387, + "num_tokens": 565085799.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "grad_norm": 1.6329883337020874, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8620132207870483, + "num_tokens": 565122388.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "grad_norm": 1.4814397096633911, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.869219183921814, + "num_tokens": 565159718.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "grad_norm": 1.4572937488555908, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8821626901626587, + "num_tokens": 565196123.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "grad_norm": 1.372646689414978, + "learning_rate": 1e-06, + "loss": 0.3434, + 
"mean_token_accuracy": 0.8790369033813477, + "num_tokens": 565238171.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "grad_norm": 1.5098075866699219, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8895807266235352, + "num_tokens": 565276004.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "grad_norm": 1.4818779230117798, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.876457929611206, + "num_tokens": 565315400.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "grad_norm": 1.6225486993789673, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8699415922164917, + "num_tokens": 565350590.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "grad_norm": 1.406743049621582, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8799309134483337, + "num_tokens": 565390708.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "grad_norm": 1.5008423328399658, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8790044784545898, + "num_tokens": 565425704.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "grad_norm": 1.509622573852539, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8838678002357483, + "num_tokens": 565466038.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "grad_norm": 1.5175999402999878, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8769642114639282, + "num_tokens": 565505142.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "grad_norm": 1.6624748706817627, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8762304782867432, + "num_tokens": 565542192.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "grad_norm": 1.59751558303833, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8776193857192993, + "num_tokens": 565580245.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + 
"grad_norm": 1.516416072845459, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.885101318359375, + "num_tokens": 565616891.0, + "step": 14822 + }, + { + "epoch": 1.8856379595471315, + "grad_norm": 1.6130174398422241, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8716464638710022, + "num_tokens": 565653168.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "grad_norm": 1.4656094312667847, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8956417441368103, + "num_tokens": 565685651.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "grad_norm": 1.528865933418274, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8806527256965637, + "num_tokens": 565723171.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "grad_norm": 1.6099425554275513, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8660294413566589, + "num_tokens": 565761105.0, + "step": 14826 + }, + { + "epoch": 1.8861468006614934, + "grad_norm": 1.5584667921066284, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.878820538520813, + "num_tokens": 565794292.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "grad_norm": 1.5877619981765747, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.875584065914154, + "num_tokens": 565828324.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "grad_norm": 1.5521293878555298, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8752199411392212, + "num_tokens": 565869386.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "grad_norm": 1.5089071989059448, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8859840631484985, + "num_tokens": 565903825.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "grad_norm": 1.5676243305206299, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8781290054321289, + 
"num_tokens": 565939294.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "grad_norm": 1.3341412544250488, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8825532793998718, + "num_tokens": 565982813.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "grad_norm": 1.6169936656951904, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8677805066108704, + "num_tokens": 566020671.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "grad_norm": 1.432334065437317, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8885046243667603, + "num_tokens": 566059426.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "grad_norm": 1.518811583518982, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8647091388702393, + "num_tokens": 566098183.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "grad_norm": 1.4262638092041016, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.87909996509552, + "num_tokens": 566138338.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "grad_norm": 1.4187321662902832, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8752654790878296, + "num_tokens": 566176160.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "grad_norm": 1.5880275964736938, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8880249857902527, + "num_tokens": 566214610.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "grad_norm": 1.6688545942306519, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8662974834442139, + "num_tokens": 566248382.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "grad_norm": 1.459324598312378, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8700639009475708, + "num_tokens": 566287548.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "grad_norm": 1.5340465307235718, + 
"learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8627405166625977, + "num_tokens": 566325405.0, + "step": 14841 + }, + { + "epoch": 1.8880549548403511, + "grad_norm": 1.654896855354309, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8781429529190063, + "num_tokens": 566354592.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "grad_norm": 1.4546560049057007, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8647831678390503, + "num_tokens": 566403266.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "grad_norm": 1.5372878313064575, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8695616126060486, + "num_tokens": 566444075.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "grad_norm": 1.429330587387085, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8803857564926147, + "num_tokens": 566485271.0, + "step": 14845 + }, + { + "epoch": 1.8885637959547132, + "grad_norm": 1.4612959623336792, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8783574104309082, + "num_tokens": 566524341.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "grad_norm": 1.4820116758346558, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.876420259475708, + "num_tokens": 566559750.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "grad_norm": 1.3513507843017578, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8687633275985718, + "num_tokens": 566603369.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "grad_norm": 1.472617268562317, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.872281551361084, + "num_tokens": 566646464.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "grad_norm": 1.5386978387832642, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.884090781211853, + "num_tokens": 566684462.0, + "step": 
14850 + }, + { + "epoch": 1.8891998473476657, + "grad_norm": 1.5979177951812744, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8727710247039795, + "num_tokens": 566723136.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "grad_norm": 1.4652652740478516, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8784693479537964, + "num_tokens": 566764579.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "grad_norm": 1.4549624919891357, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8762654066085815, + "num_tokens": 566801319.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "grad_norm": 1.5859756469726562, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8683094382286072, + "num_tokens": 566836688.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "grad_norm": 1.5396826267242432, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8763317465782166, + "num_tokens": 566874408.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "grad_norm": 1.5939066410064697, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.861499547958374, + "num_tokens": 566908816.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "grad_norm": 1.4304325580596924, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.890365719795227, + "num_tokens": 566944996.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "grad_norm": 1.4109503030776978, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8704782128334045, + "num_tokens": 566986375.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "grad_norm": 1.403033971786499, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8673381805419922, + "num_tokens": 567031290.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "grad_norm": 1.3985257148742676, + "learning_rate": 1e-06, + "loss": 
0.3387, + "mean_token_accuracy": 0.881868302822113, + "num_tokens": 567070959.0, + "step": 14860 + }, + { + "epoch": 1.8904719501335707, + "grad_norm": 1.5397025346755981, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8505878448486328, + "num_tokens": 567112409.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "grad_norm": 1.6008596420288086, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.864195704460144, + "num_tokens": 567147216.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "grad_norm": 1.685702919960022, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8595685958862305, + "num_tokens": 567189746.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "grad_norm": 1.7254550457000732, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8778303861618042, + "num_tokens": 567221143.0, + "step": 14864 + }, + { + "epoch": 1.8909807912479328, + "grad_norm": 1.5645873546600342, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8810229897499084, + "num_tokens": 567253146.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "grad_norm": 1.4332549571990967, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8742880821228027, + "num_tokens": 567294302.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "grad_norm": 1.5515578985214233, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8764298558235168, + "num_tokens": 567328884.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "grad_norm": 1.3524999618530273, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8917659521102905, + "num_tokens": 567368672.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "grad_norm": 1.4305258989334106, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8762512803077698, + "num_tokens": 567404849.0, + "step": 14869 + }, + { + "epoch": 
1.8916168426408855, + "grad_norm": 1.617655634880066, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8853724002838135, + "num_tokens": 567440640.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "grad_norm": 1.6145824193954468, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.869221031665802, + "num_tokens": 567474856.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "grad_norm": 1.4915101528167725, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8945874571800232, + "num_tokens": 567512058.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "grad_norm": 1.5098364353179932, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8845129013061523, + "num_tokens": 567551356.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "grad_norm": 1.5921517610549927, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8671584129333496, + "num_tokens": 567585360.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "grad_norm": 1.5376628637313843, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8783622980117798, + "num_tokens": 567623396.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "grad_norm": 1.560684323310852, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8725187182426453, + "num_tokens": 567658139.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "grad_norm": 1.5220056772232056, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8698776960372925, + "num_tokens": 567698692.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "grad_norm": 1.5063610076904297, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8790557384490967, + "num_tokens": 567734074.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "grad_norm": 1.5689235925674438, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 
0.8862897753715515, + "num_tokens": 567768960.0, + "step": 14879 + }, + { + "epoch": 1.8928889454267905, + "grad_norm": 1.552263855934143, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8697390556335449, + "num_tokens": 567806506.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "grad_norm": 1.5581079721450806, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8802319765090942, + "num_tokens": 567846079.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "grad_norm": 1.618018388748169, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8625356554985046, + "num_tokens": 567878514.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "grad_norm": 1.5243725776672363, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8711618781089783, + "num_tokens": 567920821.0, + "step": 14883 + }, + { + "epoch": 1.8933977865411524, + "grad_norm": 1.4768861532211304, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8697575330734253, + "num_tokens": 567963939.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "grad_norm": 1.513490915298462, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8731591701507568, + "num_tokens": 568004171.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "grad_norm": 1.5818551778793335, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8844122886657715, + "num_tokens": 568039594.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "grad_norm": 1.4104357957839966, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8895671367645264, + "num_tokens": 568077637.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "grad_norm": 1.4926468133926392, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8818605542182922, + "num_tokens": 568114203.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "grad_norm": 
1.615628957748413, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8829516172409058, + "num_tokens": 568147715.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "grad_norm": 1.4420245885849, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8803956508636475, + "num_tokens": 568188670.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "grad_norm": 1.470048427581787, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.88655686378479, + "num_tokens": 568224441.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "grad_norm": 1.5569508075714111, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8610650300979614, + "num_tokens": 568265194.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "grad_norm": 1.489632248878479, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8681315183639526, + "num_tokens": 568304334.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "grad_norm": 1.4556607007980347, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8756390810012817, + "num_tokens": 568344312.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "grad_norm": 1.6528692245483398, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8722028732299805, + "num_tokens": 568379647.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "grad_norm": 1.5931296348571777, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8802117109298706, + "num_tokens": 568413996.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "grad_norm": 1.5933486223220825, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8930310606956482, + "num_tokens": 568444745.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "grad_norm": 1.4449211359024048, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8804734349250793, + "num_tokens": 
568481982.0, + "step": 14898 + }, + { + "epoch": 1.8953059407200101, + "grad_norm": 1.5934839248657227, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.862679123878479, + "num_tokens": 568521265.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "grad_norm": 1.3822215795516968, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8752740621566772, + "num_tokens": 568563015.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "grad_norm": 1.5516666173934937, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8752156496047974, + "num_tokens": 568596567.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "grad_norm": 1.495554804801941, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8738185167312622, + "num_tokens": 568637674.0, + "step": 14902 + }, + { + "epoch": 1.8958147818343722, + "grad_norm": 1.4736062288284302, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8702185153961182, + "num_tokens": 568680161.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "grad_norm": 1.5048162937164307, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8872535228729248, + "num_tokens": 568722056.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "grad_norm": 1.3907545804977417, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8895815014839172, + "num_tokens": 568761068.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "grad_norm": 1.5074785947799683, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8601612448692322, + "num_tokens": 568804430.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "grad_norm": 1.4409123659133911, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8846144080162048, + "num_tokens": 568842032.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "grad_norm": 1.4114757776260376, + "learning_rate": 
1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8935625553131104, + "num_tokens": 568878037.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "grad_norm": 1.584739327430725, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8621610403060913, + "num_tokens": 568915085.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "grad_norm": 1.2970792055130005, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8827908635139465, + "num_tokens": 568960393.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "grad_norm": 1.691482424736023, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8653822541236877, + "num_tokens": 568993790.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "grad_norm": 1.4957302808761597, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8788238763809204, + "num_tokens": 569030800.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "grad_norm": 1.4443601369857788, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8841174840927124, + "num_tokens": 569068666.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "grad_norm": 1.5547959804534912, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8730905652046204, + "num_tokens": 569104625.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "grad_norm": 1.388394832611084, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8879855275154114, + "num_tokens": 569147112.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "grad_norm": 1.5012774467468262, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8662322163581848, + "num_tokens": 569184227.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "grad_norm": 1.364395260810852, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.884405255317688, + "num_tokens": 569227208.0, + "step": 14917 + }, + { + 
"epoch": 1.89772293601323, + "grad_norm": 1.5613712072372437, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8882781267166138, + "num_tokens": 569261147.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "grad_norm": 1.5005439519882202, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8747459650039673, + "num_tokens": 569300477.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "grad_norm": 1.493560791015625, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8885963559150696, + "num_tokens": 569336338.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "grad_norm": 1.3966115713119507, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8970052003860474, + "num_tokens": 569376741.0, + "step": 14921 + }, + { + "epoch": 1.898231777127592, + "grad_norm": 1.4019094705581665, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8725592494010925, + "num_tokens": 569417946.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "grad_norm": 1.509298324584961, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.883478045463562, + "num_tokens": 569453855.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "grad_norm": 1.4284520149230957, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8698066473007202, + "num_tokens": 569496142.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "grad_norm": 1.4523273706436157, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8856154680252075, + "num_tokens": 569532604.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "grad_norm": 1.506581425666809, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8803126811981201, + "num_tokens": 569567418.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "grad_norm": 1.444559097290039, + "learning_rate": 1e-06, + "loss": 0.3639, + 
"mean_token_accuracy": 0.8756857514381409, + "num_tokens": 569605559.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "grad_norm": 1.6109821796417236, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.887927770614624, + "num_tokens": 569637107.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "grad_norm": 1.4476923942565918, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8766629695892334, + "num_tokens": 569675641.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "grad_norm": 1.5313868522644043, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8768147230148315, + "num_tokens": 569713282.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "grad_norm": 1.4521185159683228, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8801866769790649, + "num_tokens": 569750043.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "grad_norm": 1.4375571012496948, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8744972944259644, + "num_tokens": 569793434.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "grad_norm": 1.4858276844024658, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8800034523010254, + "num_tokens": 569833241.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "grad_norm": 1.4571244716644287, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.884520411491394, + "num_tokens": 569868853.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "grad_norm": 1.5620386600494385, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8719935417175293, + "num_tokens": 569905574.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "grad_norm": 1.4769327640533447, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8996773362159729, + "num_tokens": 569941008.0, + "step": 14936 + }, + { + "epoch": 
1.9001399313064495, + "grad_norm": 1.7044758796691895, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8712651133537292, + "num_tokens": 569973112.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "grad_norm": 1.4356151819229126, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8783180713653564, + "num_tokens": 570011929.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "grad_norm": 1.4002771377563477, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8689193725585938, + "num_tokens": 570053501.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "grad_norm": 1.5544939041137695, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8687840700149536, + "num_tokens": 570090504.0, + "step": 14940 + }, + { + "epoch": 1.9006487724208116, + "grad_norm": 1.463911533355713, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8821557760238647, + "num_tokens": 570127261.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "grad_norm": 1.4201198816299438, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8880628347396851, + "num_tokens": 570163254.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "grad_norm": 1.5032010078430176, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8836092948913574, + "num_tokens": 570201961.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "grad_norm": 1.4416577816009521, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8770942091941833, + "num_tokens": 570242826.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "grad_norm": 1.5318453311920166, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8800137042999268, + "num_tokens": 570275796.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "grad_norm": 1.4761543273925781, + "learning_rate": 1e-06, + "loss": 0.3146, + 
"mean_token_accuracy": 0.8881404995918274, + "num_tokens": 570313582.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "grad_norm": 1.5597541332244873, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8801411390304565, + "num_tokens": 570351835.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "grad_norm": 1.4512457847595215, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8701289892196655, + "num_tokens": 570391722.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "grad_norm": 1.3835691213607788, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8760125637054443, + "num_tokens": 570431385.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "grad_norm": 1.5136221647262573, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8855433464050293, + "num_tokens": 570466607.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "grad_norm": 1.5966490507125854, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8858093619346619, + "num_tokens": 570501227.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "grad_norm": 1.4047491550445557, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8844264149665833, + "num_tokens": 570542363.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "grad_norm": 1.4776514768600464, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8899907469749451, + "num_tokens": 570582043.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "grad_norm": 1.439347505569458, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8766036033630371, + "num_tokens": 570622987.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "grad_norm": 1.5609508752822876, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8736072778701782, + "num_tokens": 570657920.0, + "step": 14955 + }, + { + "epoch": 
1.9025569265996691, + "grad_norm": 1.4659491777420044, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8950340151786804, + "num_tokens": 570692422.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "grad_norm": 1.4287166595458984, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8837082982063293, + "num_tokens": 570734313.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "grad_norm": 1.5095508098602295, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8928104043006897, + "num_tokens": 570771542.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "grad_norm": 1.5285509824752808, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8722338676452637, + "num_tokens": 570808500.0, + "step": 14959 + }, + { + "epoch": 1.9030657677140312, + "grad_norm": 1.3586788177490234, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8956005573272705, + "num_tokens": 570847564.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "grad_norm": 1.4712246656417847, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.877167820930481, + "num_tokens": 570885270.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "grad_norm": 1.4452844858169556, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8893911242485046, + "num_tokens": 570923049.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "grad_norm": 1.5733648538589478, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.874566376209259, + "num_tokens": 570957953.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "grad_norm": 1.550482988357544, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8757990598678589, + "num_tokens": 570993254.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "grad_norm": 1.6192131042480469, + "learning_rate": 1e-06, + "loss": 0.3469, + 
"mean_token_accuracy": 0.8774504661560059, + "num_tokens": 571030933.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "grad_norm": 1.6278964281082153, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8811798095703125, + "num_tokens": 571066046.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "grad_norm": 1.5048185586929321, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8678938746452332, + "num_tokens": 571105323.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "grad_norm": 1.5539659261703491, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.882080614566803, + "num_tokens": 571141170.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "grad_norm": 1.4308711290359497, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8687489032745361, + "num_tokens": 571185547.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "grad_norm": 1.6281648874282837, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8759819269180298, + "num_tokens": 571219160.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "grad_norm": 1.451564073562622, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.877565860748291, + "num_tokens": 571255122.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "grad_norm": 1.6006510257720947, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8805800676345825, + "num_tokens": 571287010.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "grad_norm": 1.5279781818389893, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8757012486457825, + "num_tokens": 571324425.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "grad_norm": 1.4582055807113647, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8761621713638306, + "num_tokens": 571364893.0, + "step": 14974 + }, + { + "epoch": 
1.904973921892889, + "grad_norm": 1.2723629474639893, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8954796195030212, + "num_tokens": 571409472.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "grad_norm": 1.6435428857803345, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8637073040008545, + "num_tokens": 571444859.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "grad_norm": 1.5333309173583984, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8550676107406616, + "num_tokens": 571486533.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "grad_norm": 1.4177329540252686, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8737964630126953, + "num_tokens": 571527937.0, + "step": 14978 + }, + { + "epoch": 1.9054827630072508, + "grad_norm": 1.4394772052764893, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8841004371643066, + "num_tokens": 571568727.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "grad_norm": 1.4817147254943848, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8774643540382385, + "num_tokens": 571609176.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "grad_norm": 1.4773492813110352, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8810379505157471, + "num_tokens": 571648713.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "grad_norm": 1.5435394048690796, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8785202503204346, + "num_tokens": 571683530.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "grad_norm": 1.4717854261398315, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8824193477630615, + "num_tokens": 571724155.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "grad_norm": 1.6033438444137573, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 
0.8654208183288574, + "num_tokens": 571762719.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "grad_norm": 1.5503023862838745, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8734533786773682, + "num_tokens": 571802977.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "grad_norm": 1.5022989511489868, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8853905200958252, + "num_tokens": 571839154.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "grad_norm": 1.5637271404266357, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8759363293647766, + "num_tokens": 571881553.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "grad_norm": 1.52839994430542, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8832256197929382, + "num_tokens": 571915230.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "grad_norm": 1.4801225662231445, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.873483419418335, + "num_tokens": 571954073.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "grad_norm": 1.4719676971435547, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8818345665931702, + "num_tokens": 571993110.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "grad_norm": 1.5681997537612915, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.871468186378479, + "num_tokens": 572028757.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "grad_norm": 1.5246816873550415, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8729623556137085, + "num_tokens": 572067105.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "grad_norm": 1.5479611158370972, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8815138936042786, + "num_tokens": 572103593.0, + "step": 14993 + }, + { + "epoch": 1.9073909171861088, + "grad_norm": 
1.598527193069458, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8807653188705444, + "num_tokens": 572136142.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "grad_norm": 1.5390594005584717, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8851446509361267, + "num_tokens": 572174358.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "grad_norm": 1.5161023139953613, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8767472505569458, + "num_tokens": 572211944.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "grad_norm": 1.5905537605285645, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8666445016860962, + "num_tokens": 572244931.0, + "step": 14997 + }, + { + "epoch": 1.9078997583004706, + "grad_norm": 1.634194254875183, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.887131929397583, + "num_tokens": 572277626.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "grad_norm": 1.6479034423828125, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8554250001907349, + "num_tokens": 572314408.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "grad_norm": 1.4118022918701172, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8807211518287659, + "num_tokens": 572354870.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "grad_norm": 1.6544073820114136, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8805570006370544, + "num_tokens": 572386762.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "grad_norm": 1.460600733757019, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8836976289749146, + "num_tokens": 572425879.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "grad_norm": 1.4545172452926636, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8797032833099365, + "num_tokens": 
572466454.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "grad_norm": 1.5065964460372925, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8657823801040649, + "num_tokens": 572511514.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "grad_norm": 1.5370274782180786, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8889070749282837, + "num_tokens": 572543021.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "grad_norm": 1.5303651094436646, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8768380880355835, + "num_tokens": 572579611.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "grad_norm": 1.4988150596618652, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8699763417243958, + "num_tokens": 572616966.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "grad_norm": 1.4953818321228027, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8810141682624817, + "num_tokens": 572659990.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "grad_norm": 1.540114164352417, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.875664234161377, + "num_tokens": 572699404.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "grad_norm": 1.5957369804382324, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8693070411682129, + "num_tokens": 572735432.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "grad_norm": 1.486773133277893, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8821688890457153, + "num_tokens": 572773063.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "grad_norm": 1.3802639245986938, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.872952938079834, + "num_tokens": 572818182.0, + "step": 15012 + }, + { + "epoch": 1.9098079124793284, + "grad_norm": 1.5279767513275146, + "learning_rate": 
1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8780403137207031, + "num_tokens": 572853717.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "grad_norm": 1.547436237335205, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8567179441452026, + "num_tokens": 572894076.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "grad_norm": 1.4149636030197144, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8849461078643799, + "num_tokens": 572937433.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "grad_norm": 1.6030311584472656, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.88197922706604, + "num_tokens": 572969990.0, + "step": 15016 + }, + { + "epoch": 1.9103167535936905, + "grad_norm": 1.4826998710632324, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8724090456962585, + "num_tokens": 573007402.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "grad_norm": 1.4144796133041382, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8957522511482239, + "num_tokens": 573044883.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "grad_norm": 1.4530409574508667, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8727948069572449, + "num_tokens": 573084550.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "grad_norm": 1.3853827714920044, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.890178918838501, + "num_tokens": 573124593.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "grad_norm": 1.3552440404891968, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8797283172607422, + "num_tokens": 573171732.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "grad_norm": 1.5892478227615356, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8671699166297913, + "num_tokens": 573209973.0, + "step": 15022 + }, + { + 
"epoch": 1.9110800152652334, + "grad_norm": 1.6745489835739136, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8678556680679321, + "num_tokens": 573245669.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "grad_norm": 1.532928228378296, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8732023239135742, + "num_tokens": 573284738.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "grad_norm": 1.4866421222686768, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8827204704284668, + "num_tokens": 573324282.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "grad_norm": 1.4949008226394653, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8801801204681396, + "num_tokens": 573363739.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "grad_norm": 1.568307876586914, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.870181679725647, + "num_tokens": 573398998.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "grad_norm": 1.5947760343551636, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8819199800491333, + "num_tokens": 573433816.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "grad_norm": 1.5479826927185059, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8823102712631226, + "num_tokens": 573468382.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "grad_norm": 1.7571983337402344, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.874350905418396, + "num_tokens": 573501129.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "grad_norm": 1.3626307249069214, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8944125771522522, + "num_tokens": 573542393.0, + "step": 15031 + }, + { + "epoch": 1.912224907772548, + "grad_norm": 1.4069225788116455, + "learning_rate": 1e-06, + "loss": 0.3312, + 
"mean_token_accuracy": 0.878905177116394, + "num_tokens": 573580687.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "grad_norm": 1.345778465270996, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8933406472206116, + "num_tokens": 573624671.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "grad_norm": 1.6048322916030884, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8599735498428345, + "num_tokens": 573660170.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "grad_norm": 1.432070255279541, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.86683189868927, + "num_tokens": 573705686.0, + "step": 15035 + }, + { + "epoch": 1.91273374888691, + "grad_norm": 1.5929200649261475, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8767491579055786, + "num_tokens": 573743504.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "grad_norm": 1.4446372985839844, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8766000866889954, + "num_tokens": 573781133.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "grad_norm": 1.4248833656311035, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8864045143127441, + "num_tokens": 573819548.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "grad_norm": 1.4080153703689575, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8706283569335938, + "num_tokens": 573862573.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "grad_norm": 1.4967819452285767, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8709312677383423, + "num_tokens": 573901782.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "grad_norm": 1.520095705986023, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8853952884674072, + "num_tokens": 573941534.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + 
"grad_norm": 1.4958537817001343, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8740018606185913, + "num_tokens": 573979985.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "grad_norm": 1.3782377243041992, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8748878836631775, + "num_tokens": 574021123.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "grad_norm": 1.428260087966919, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8833640813827515, + "num_tokens": 574061654.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "grad_norm": 1.4151084423065186, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8636276721954346, + "num_tokens": 574103776.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "grad_norm": 1.420453429222107, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8897678852081299, + "num_tokens": 574140374.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "grad_norm": 1.4525785446166992, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8839304447174072, + "num_tokens": 574178649.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "grad_norm": 1.5429826974868774, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.877327024936676, + "num_tokens": 574216521.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "grad_norm": 1.4895012378692627, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8785476088523865, + "num_tokens": 574253402.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "grad_norm": 1.3277019262313843, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8825521469116211, + "num_tokens": 574298219.0, + "step": 15050 + }, + { + "epoch": 1.9146419030657678, + "grad_norm": 1.4620451927185059, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8787903785705566, + 
"num_tokens": 574336924.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "grad_norm": 1.4722031354904175, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8680403232574463, + "num_tokens": 574377653.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "grad_norm": 1.4489728212356567, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8848024606704712, + "num_tokens": 574417660.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "grad_norm": 1.4064887762069702, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8788342475891113, + "num_tokens": 574457415.0, + "step": 15054 + }, + { + "epoch": 1.9151507441801296, + "grad_norm": 1.5015089511871338, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8819319009780884, + "num_tokens": 574494139.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "grad_norm": 1.5564305782318115, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.878812849521637, + "num_tokens": 574531357.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "grad_norm": 1.404714584350586, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8716414570808411, + "num_tokens": 574577653.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "grad_norm": 1.533781886100769, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8749872446060181, + "num_tokens": 574616870.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "grad_norm": 1.390392541885376, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8690267205238342, + "num_tokens": 574660468.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "grad_norm": 1.3561245203018188, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8804336786270142, + "num_tokens": 574704495.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "grad_norm": 1.4993938207626343, + 
"learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8867716789245605, + "num_tokens": 574743306.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "grad_norm": 1.5409071445465088, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8846912980079651, + "num_tokens": 574781086.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "grad_norm": 1.51874577999115, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8761278390884399, + "num_tokens": 574818401.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "grad_norm": 1.5808483362197876, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8775006532669067, + "num_tokens": 574854185.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "grad_norm": 1.4351979494094849, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8836188912391663, + "num_tokens": 574894461.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "grad_norm": 1.581896185874939, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.882416844367981, + "num_tokens": 574927918.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "grad_norm": 1.640974521636963, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8793231844902039, + "num_tokens": 574965586.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "grad_norm": 1.6584540605545044, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8627541661262512, + "num_tokens": 574997929.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "grad_norm": 1.666593313217163, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8749363422393799, + "num_tokens": 575032761.0, + "step": 15069 + }, + { + "epoch": 1.9170588983589874, + "grad_norm": 1.5115667581558228, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8714082837104797, + "num_tokens": 575070183.0, + "step": 
15070 + }, + { + "epoch": 1.9171861086375779, + "grad_norm": 1.4350489377975464, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8904525637626648, + "num_tokens": 575108347.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "grad_norm": 1.540124535560608, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8905469179153442, + "num_tokens": 575142244.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "grad_norm": 1.6870883703231812, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8722531795501709, + "num_tokens": 575173499.0, + "step": 15073 + }, + { + "epoch": 1.9175677394733495, + "grad_norm": 1.5759010314941406, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8769615888595581, + "num_tokens": 575209771.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "grad_norm": 1.3366843461990356, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8837577104568481, + "num_tokens": 575255702.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "grad_norm": 1.6075626611709595, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8837144374847412, + "num_tokens": 575290687.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "grad_norm": 1.5543577671051025, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8859934210777283, + "num_tokens": 575322985.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "grad_norm": 1.4532713890075684, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8842443227767944, + "num_tokens": 575361174.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "grad_norm": 1.4923533201217651, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8772170543670654, + "num_tokens": 575399727.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "grad_norm": 1.533125638961792, + "learning_rate": 1e-06, + "loss": 
0.3608, + "mean_token_accuracy": 0.8752866387367249, + "num_tokens": 575438261.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "grad_norm": 1.3927282094955444, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8693646192550659, + "num_tokens": 575480369.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "grad_norm": 1.5205551385879517, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8723232746124268, + "num_tokens": 575522285.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "grad_norm": 1.448356032371521, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8745121955871582, + "num_tokens": 575564444.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "grad_norm": 1.5120521783828735, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8823215961456299, + "num_tokens": 575600564.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "grad_norm": 1.4958375692367554, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8712769746780396, + "num_tokens": 575642367.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "grad_norm": 1.4517698287963867, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.889241099357605, + "num_tokens": 575677869.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "grad_norm": 1.5518831014633179, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8777811527252197, + "num_tokens": 575712262.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "grad_norm": 1.5460842847824097, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8919066190719604, + "num_tokens": 575748301.0, + "step": 15088 + }, + { + "epoch": 1.9194758936522072, + "grad_norm": 1.5398563146591187, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8749794960021973, + "num_tokens": 575785670.0, + "step": 15089 + }, + { + "epoch": 
1.9196031039307977, + "grad_norm": 1.5058701038360596, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8813769221305847, + "num_tokens": 575820041.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "grad_norm": 1.357070803642273, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8842194080352783, + "num_tokens": 575863034.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "grad_norm": 1.4860960245132446, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8869461417198181, + "num_tokens": 575897585.0, + "step": 15092 + }, + { + "epoch": 1.9199847347665693, + "grad_norm": 1.4588606357574463, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8780779838562012, + "num_tokens": 575935430.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "grad_norm": 1.6527656316757202, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8801090717315674, + "num_tokens": 575968053.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "grad_norm": 1.5435881614685059, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8797074556350708, + "num_tokens": 576003170.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "grad_norm": 1.5781726837158203, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8716607093811035, + "num_tokens": 576040373.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "grad_norm": 1.3641862869262695, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8850204348564148, + "num_tokens": 576086480.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "grad_norm": 1.5908764600753784, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8744804263114929, + "num_tokens": 576125538.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "grad_norm": 1.6097501516342163, + "learning_rate": 1e-06, + "loss": 0.4042, + 
"mean_token_accuracy": 0.8614217638969421, + "num_tokens": 576164067.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "grad_norm": 1.734262466430664, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8647580146789551, + "num_tokens": 576203072.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "grad_norm": 1.5546549558639526, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8656518459320068, + "num_tokens": 576240001.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "grad_norm": 1.494292974472046, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8858034610748291, + "num_tokens": 576278871.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "grad_norm": 1.4925299882888794, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8802628517150879, + "num_tokens": 576315105.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "grad_norm": 1.3447750806808472, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8850345015525818, + "num_tokens": 576356718.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "grad_norm": 1.4747017621994019, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8787677884101868, + "num_tokens": 576392959.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "grad_norm": 1.635216236114502, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8781408667564392, + "num_tokens": 576425859.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "grad_norm": 1.4979571104049683, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.879638135433197, + "num_tokens": 576465068.0, + "step": 15107 + }, + { + "epoch": 1.9218928889454268, + "grad_norm": 1.5862280130386353, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8681313395500183, + "num_tokens": 576502335.0, + "step": 15108 + }, + { + "epoch": 
1.9220200992240173, + "grad_norm": 1.8388466835021973, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8715532422065735, + "num_tokens": 576532091.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "grad_norm": 1.514785885810852, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8804208040237427, + "num_tokens": 576567065.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "grad_norm": 1.5916221141815186, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.883914053440094, + "num_tokens": 576598544.0, + "step": 15111 + }, + { + "epoch": 1.9224017300597889, + "grad_norm": 1.400033950805664, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8897513747215271, + "num_tokens": 576638032.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "grad_norm": 1.4762341976165771, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8751167058944702, + "num_tokens": 576676135.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "grad_norm": 1.4322059154510498, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8733597993850708, + "num_tokens": 576719395.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "grad_norm": 1.5474905967712402, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.869215190410614, + "num_tokens": 576761085.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "grad_norm": 1.4768561124801636, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8850878477096558, + "num_tokens": 576799277.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "grad_norm": 1.6046369075775146, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8730089664459229, + "num_tokens": 576832118.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "grad_norm": 1.6379598379135132, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 
0.8807926177978516, + "num_tokens": 576865602.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "grad_norm": 1.568211555480957, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8819187879562378, + "num_tokens": 576900527.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "grad_norm": 1.4675829410552979, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8871956467628479, + "num_tokens": 576938225.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "grad_norm": 1.6563926935195923, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8818906545639038, + "num_tokens": 576970123.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "grad_norm": 1.6903314590454102, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8657072186470032, + "num_tokens": 577002918.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "grad_norm": 1.472550868988037, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8779934644699097, + "num_tokens": 577040899.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "grad_norm": 1.460207462310791, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8763894438743591, + "num_tokens": 577079458.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "grad_norm": 1.6211031675338745, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8831385374069214, + "num_tokens": 577110821.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "grad_norm": 1.501159429550171, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8684359192848206, + "num_tokens": 577151608.0, + "step": 15126 + }, + { + "epoch": 1.9243098842386464, + "grad_norm": 1.5273234844207764, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8675445318222046, + "num_tokens": 577192438.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "grad_norm": 
1.4111756086349487, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8773574829101562, + "num_tokens": 577233399.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "grad_norm": 1.5736311674118042, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8824939131736755, + "num_tokens": 577270769.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "grad_norm": 1.5791658163070679, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8833723068237305, + "num_tokens": 577302283.0, + "step": 15130 + }, + { + "epoch": 1.9248187253530085, + "grad_norm": 1.572792649269104, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8707054853439331, + "num_tokens": 577336459.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "grad_norm": 1.5037939548492432, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8614501953125, + "num_tokens": 577376224.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "grad_norm": 1.4560474157333374, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8722023963928223, + "num_tokens": 577413878.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "grad_norm": 1.525650978088379, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8862937092781067, + "num_tokens": 577450901.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "grad_norm": 1.5532339811325073, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8584214448928833, + "num_tokens": 577489616.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "grad_norm": 1.4771286249160767, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8776980638504028, + "num_tokens": 577528120.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "grad_norm": 1.4610224962234497, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.884153425693512, + "num_tokens": 
577562691.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "grad_norm": 1.4005956649780273, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8855329751968384, + "num_tokens": 577600480.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "grad_norm": 1.5915817022323608, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8896484375, + "num_tokens": 577633034.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "grad_norm": 1.598855972290039, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8548562526702881, + "num_tokens": 577671627.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "grad_norm": 1.5989511013031006, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8786460161209106, + "num_tokens": 577704911.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "grad_norm": 1.3786152601242065, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8931061625480652, + "num_tokens": 577738732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "grad_norm": 1.4533771276474, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8818435668945312, + "num_tokens": 577776147.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "grad_norm": 1.4041446447372437, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8887627124786377, + "num_tokens": 577817290.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "grad_norm": 1.6375553607940674, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.873778760433197, + "num_tokens": 577847739.0, + "step": 15145 + }, + { + "epoch": 1.9267268795318662, + "grad_norm": 1.396410584449768, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8861026763916016, + "num_tokens": 577885656.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "grad_norm": 1.49928879737854, + "learning_rate": 1e-06, + 
"loss": 0.3365, + "mean_token_accuracy": 0.882429301738739, + "num_tokens": 577929376.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "grad_norm": 1.4508593082427979, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8638349771499634, + "num_tokens": 577969112.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "grad_norm": 1.3669995069503784, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8728199005126953, + "num_tokens": 578012258.0, + "step": 15149 + }, + { + "epoch": 1.927235720646228, + "grad_norm": 1.4791138172149658, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8769898414611816, + "num_tokens": 578051628.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "grad_norm": 1.4436107873916626, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8632590770721436, + "num_tokens": 578092980.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "grad_norm": 1.4782979488372803, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8830379247665405, + "num_tokens": 578133476.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "grad_norm": 1.5005133152008057, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8735188245773315, + "num_tokens": 578175888.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "grad_norm": 1.5683541297912598, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8724300861358643, + "num_tokens": 578211911.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "grad_norm": 1.6401400566101074, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8851406574249268, + "num_tokens": 578244192.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "grad_norm": 1.4629220962524414, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8597313165664673, + "num_tokens": 578289384.0, + "step": 15156 + }, + { + 
"epoch": 1.9281261925963618, + "grad_norm": 1.5722479820251465, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8696620464324951, + "num_tokens": 578330129.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "grad_norm": 1.607643723487854, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.877301812171936, + "num_tokens": 578363296.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "grad_norm": 1.4953198432922363, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8880918025970459, + "num_tokens": 578404573.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "grad_norm": 1.4886199235916138, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.893700122833252, + "num_tokens": 578436177.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "grad_norm": 1.3158107995986938, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8884826898574829, + "num_tokens": 578479468.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "grad_norm": 1.3267648220062256, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8898895978927612, + "num_tokens": 578524039.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "grad_norm": 1.483441948890686, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8826755285263062, + "num_tokens": 578565318.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "grad_norm": 1.561400294303894, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8827247619628906, + "num_tokens": 578598888.0, + "step": 15164 + }, + { + "epoch": 1.929143874825086, + "grad_norm": 1.5563020706176758, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8706917762756348, + "num_tokens": 578639581.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "grad_norm": 1.776795506477356, + "learning_rate": 1e-06, + "loss": 0.3855, + 
"mean_token_accuracy": 0.8643569946289062, + "num_tokens": 578675041.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "grad_norm": 1.4493423700332642, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8851619958877563, + "num_tokens": 578716759.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "grad_norm": 1.3611791133880615, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8849015831947327, + "num_tokens": 578758765.0, + "step": 15168 + }, + { + "epoch": 1.9296527159394479, + "grad_norm": 1.5898823738098145, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8604608774185181, + "num_tokens": 578795717.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "grad_norm": 1.4393129348754883, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8840639591217041, + "num_tokens": 578834223.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "grad_norm": 1.496414303779602, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8838925957679749, + "num_tokens": 578871664.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "grad_norm": 1.6599397659301758, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8720054626464844, + "num_tokens": 578915191.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "grad_norm": 1.5398662090301514, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8692371845245361, + "num_tokens": 578952516.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "grad_norm": 1.5402644872665405, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.872846245765686, + "num_tokens": 578989280.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "grad_norm": 1.50991952419281, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8716446757316589, + "num_tokens": 579027177.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, 
+ "grad_norm": 1.5091568231582642, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8658599257469177, + "num_tokens": 579064483.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "grad_norm": 1.429407000541687, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8875941038131714, + "num_tokens": 579102755.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "grad_norm": 1.6118638515472412, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8852753639221191, + "num_tokens": 579133909.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "grad_norm": 1.4942145347595215, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8776630759239197, + "num_tokens": 579171532.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "grad_norm": 1.5371489524841309, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8750614523887634, + "num_tokens": 579208461.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "grad_norm": 1.5638383626937866, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8694961071014404, + "num_tokens": 579244841.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "grad_norm": 1.4672958850860596, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8796791434288025, + "num_tokens": 579282255.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "grad_norm": 1.6584206819534302, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.874829888343811, + "num_tokens": 579313123.0, + "step": 15183 + }, + { + "epoch": 1.9315608701183056, + "grad_norm": 1.4058277606964111, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8910913467407227, + "num_tokens": 579350161.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "grad_norm": 1.4710921049118042, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8780823945999146, 
+ "num_tokens": 579389310.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "grad_norm": 1.437529444694519, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.877446174621582, + "num_tokens": 579429334.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "grad_norm": 1.451545000076294, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8814828395843506, + "num_tokens": 579469934.0, + "step": 15187 + }, + { + "epoch": 1.9320697112326677, + "grad_norm": 1.5244559049606323, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8639950156211853, + "num_tokens": 579511258.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "grad_norm": 1.4130120277404785, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.887961745262146, + "num_tokens": 579551879.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "grad_norm": 1.501095175743103, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.878079891204834, + "num_tokens": 579587315.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "grad_norm": 1.5031688213348389, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8705839514732361, + "num_tokens": 579624712.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "grad_norm": 1.4811452627182007, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8819206953048706, + "num_tokens": 579665183.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "grad_norm": 1.6437259912490845, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8711624145507812, + "num_tokens": 579699334.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "grad_norm": 1.543831706047058, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8869516849517822, + "num_tokens": 579735463.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "grad_norm": 1.3881808519363403, + 
"learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8825473189353943, + "num_tokens": 579778243.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "grad_norm": 1.5903187990188599, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8828184604644775, + "num_tokens": 579810007.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "grad_norm": 1.4757322072982788, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8829310536384583, + "num_tokens": 579847276.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "grad_norm": 1.4471256732940674, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8765360116958618, + "num_tokens": 579885454.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "grad_norm": 1.6913913488388062, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8741029500961304, + "num_tokens": 579919292.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "grad_norm": 1.3857247829437256, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8761248588562012, + "num_tokens": 579958889.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "grad_norm": 1.4639900922775269, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.889992892742157, + "num_tokens": 579992689.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "grad_norm": 1.492366909980774, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8710594773292542, + "num_tokens": 580033087.0, + "step": 15202 + }, + { + "epoch": 1.9339778654115252, + "grad_norm": 1.625246286392212, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8586882948875427, + "num_tokens": 580067535.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "grad_norm": 1.5591732263565063, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8703103065490723, + "num_tokens": 580107219.0, + 
"step": 15204 + }, + { + "epoch": 1.9342322859687062, + "grad_norm": 1.4609506130218506, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8838884234428406, + "num_tokens": 580143838.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "grad_norm": 1.8674890995025635, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8588575124740601, + "num_tokens": 580173194.0, + "step": 15206 + }, + { + "epoch": 1.9344867065258873, + "grad_norm": 1.5867071151733398, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8796533942222595, + "num_tokens": 580207116.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "grad_norm": 1.5223468542099, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8762640953063965, + "num_tokens": 580241588.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "grad_norm": 1.414262294769287, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8879618644714355, + "num_tokens": 580279469.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "grad_norm": 1.5233843326568604, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8717122673988342, + "num_tokens": 580317645.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "grad_norm": 1.415730595588684, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8796296715736389, + "num_tokens": 580356558.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "grad_norm": 1.51595139503479, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8635835647583008, + "num_tokens": 580393782.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "grad_norm": 1.6043128967285156, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8844181299209595, + "num_tokens": 580427758.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "grad_norm": 1.4636554718017578, + "learning_rate": 1e-06, + "loss": 
0.3587, + "mean_token_accuracy": 0.8740646839141846, + "num_tokens": 580468791.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "grad_norm": 1.48428213596344, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8925515413284302, + "num_tokens": 580502620.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "grad_norm": 1.4250291585922241, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8884510397911072, + "num_tokens": 580540548.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "grad_norm": 1.5518782138824463, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8744354844093323, + "num_tokens": 580577191.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "grad_norm": 1.5495383739471436, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8616171479225159, + "num_tokens": 580614598.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "grad_norm": 1.501196265220642, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8716608285903931, + "num_tokens": 580653438.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "grad_norm": 1.4492924213409424, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8709272742271423, + "num_tokens": 580691534.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "grad_norm": 1.4824798107147217, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.881062388420105, + "num_tokens": 580725769.0, + "step": 15221 + }, + { + "epoch": 1.936394860704745, + "grad_norm": 1.3620492219924927, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8856821656227112, + "num_tokens": 580768621.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "grad_norm": 1.370518684387207, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8938882350921631, + "num_tokens": 580806132.0, + "step": 15223 + }, + { + "epoch": 
1.9366492812619258, + "grad_norm": 1.5901286602020264, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8649181723594666, + "num_tokens": 580841456.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "grad_norm": 1.410422682762146, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8889255523681641, + "num_tokens": 580880387.0, + "step": 15225 + }, + { + "epoch": 1.9369037018191069, + "grad_norm": 1.4763089418411255, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8703200817108154, + "num_tokens": 580924104.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "grad_norm": 1.5202380418777466, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8727573752403259, + "num_tokens": 580960634.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "grad_norm": 1.4726228713989258, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8847024440765381, + "num_tokens": 580999581.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "grad_norm": 1.4065535068511963, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8890975713729858, + "num_tokens": 581042122.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "grad_norm": 1.6343071460723877, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8777199983596802, + "num_tokens": 581074395.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "grad_norm": 1.47834312915802, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8666852116584778, + "num_tokens": 581114072.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "grad_norm": 1.5295050144195557, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8800041079521179, + "num_tokens": 581151479.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "grad_norm": 1.5338321924209595, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 
0.8909944295883179, + "num_tokens": 581189805.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "grad_norm": 1.5229418277740479, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8758683204650879, + "num_tokens": 581226498.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "grad_norm": 1.4715938568115234, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8844641447067261, + "num_tokens": 581263652.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "grad_norm": 1.476157546043396, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.86966872215271, + "num_tokens": 581301255.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "grad_norm": 1.5808230638504028, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8795980215072632, + "num_tokens": 581335634.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "grad_norm": 1.5305230617523193, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8622778654098511, + "num_tokens": 581378518.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "grad_norm": 1.5825961828231812, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8694535493850708, + "num_tokens": 581418196.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "grad_norm": 1.5600926876068115, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8680324554443359, + "num_tokens": 581454536.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + "grad_norm": 1.6148555278778076, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8753530979156494, + "num_tokens": 581488066.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "grad_norm": 1.4951454401016235, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8732086420059204, + "num_tokens": 581526890.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "grad_norm": 
1.438501000404358, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8735273480415344, + "num_tokens": 581568023.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "grad_norm": 1.4878777265548706, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8793908357620239, + "num_tokens": 581604416.0, + "step": 15244 + }, + { + "epoch": 1.9393206971123267, + "grad_norm": 1.4284507036209106, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8838769793510437, + "num_tokens": 581644108.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "grad_norm": 1.4539769887924194, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.890740156173706, + "num_tokens": 581681000.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "grad_norm": 1.4670257568359375, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8718482255935669, + "num_tokens": 581720731.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "grad_norm": 1.4146517515182495, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8789021372795105, + "num_tokens": 581759507.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "grad_norm": 1.4978581666946411, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.880618155002594, + "num_tokens": 581797310.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "grad_norm": 1.6318385601043701, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.863723874092102, + "num_tokens": 581831321.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "grad_norm": 1.486307144165039, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8886522650718689, + "num_tokens": 581864598.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "grad_norm": 1.555938959121704, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8746391534805298, + "num_tokens": 
581903059.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "grad_norm": 1.4949201345443726, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8745070695877075, + "num_tokens": 581942173.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "grad_norm": 1.5716499090194702, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8732976317405701, + "num_tokens": 581979008.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "grad_norm": 1.4451254606246948, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8790453672409058, + "num_tokens": 582018650.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "grad_norm": 1.5409643650054932, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8884892463684082, + "num_tokens": 582050014.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "grad_norm": 1.4015240669250488, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8889803290367126, + "num_tokens": 582087806.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "grad_norm": 1.3753536939620972, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8797120451927185, + "num_tokens": 582133053.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "grad_norm": 1.506193995475769, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8814111948013306, + "num_tokens": 582166551.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "grad_norm": 1.4891105890274048, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8826135993003845, + "num_tokens": 582202163.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "grad_norm": 1.5849086046218872, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.870103657245636, + "num_tokens": 582238558.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "grad_norm": 1.454492449760437, + "learning_rate": 
1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8810145258903503, + "num_tokens": 582275960.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "grad_norm": 1.5635689496994019, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8663620352745056, + "num_tokens": 582316539.0, + "step": 15263 + }, + { + "epoch": 1.9417376924055465, + "grad_norm": 1.5918145179748535, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8642575144767761, + "num_tokens": 582351665.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "grad_norm": 1.382690191268921, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8772937059402466, + "num_tokens": 582392861.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "grad_norm": 1.263135552406311, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8981577157974243, + "num_tokens": 582436491.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "grad_norm": 1.4765689373016357, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8835135102272034, + "num_tokens": 582474395.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "grad_norm": 1.4224802255630493, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8583031892776489, + "num_tokens": 582517537.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "grad_norm": 1.4339710474014282, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8862317800521851, + "num_tokens": 582557437.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "grad_norm": 1.6100965738296509, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.862662672996521, + "num_tokens": 582594324.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "grad_norm": 1.5394587516784668, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8791406154632568, + "num_tokens": 582632503.0, + "step": 15271 + }, + { + 
"epoch": 1.9427553746342705, + "grad_norm": 1.4881409406661987, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8675525188446045, + "num_tokens": 582670312.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "grad_norm": 1.5190019607543945, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8734573721885681, + "num_tokens": 582707502.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "grad_norm": 1.5056240558624268, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8845263719558716, + "num_tokens": 582744575.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "grad_norm": 1.418811321258545, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8850233554840088, + "num_tokens": 582784664.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "grad_norm": 1.5242561101913452, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8684802055358887, + "num_tokens": 582819394.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "grad_norm": 1.5498570203781128, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8864721059799194, + "num_tokens": 582851303.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "grad_norm": 1.572685718536377, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8636836409568787, + "num_tokens": 582888216.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "grad_norm": 1.459246277809143, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8681751489639282, + "num_tokens": 582928898.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "grad_norm": 1.3639320135116577, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.890358030796051, + "num_tokens": 582970507.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "grad_norm": 1.434971809387207, + "learning_rate": 1e-06, + "loss": 0.3169, + 
"mean_token_accuracy": 0.8880516290664673, + "num_tokens": 583009842.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "grad_norm": 1.5847420692443848, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8847260475158691, + "num_tokens": 583044967.0, + "step": 15282 + }, + { + "epoch": 1.944154687698766, + "grad_norm": 1.3218662738800049, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8932421803474426, + "num_tokens": 583088786.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "grad_norm": 1.389938473701477, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8900288939476013, + "num_tokens": 583127290.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "grad_norm": 1.533722996711731, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8804115653038025, + "num_tokens": 583159766.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "grad_norm": 1.6172314882278442, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8591092824935913, + "num_tokens": 583194112.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "grad_norm": 1.4789845943450928, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8686275482177734, + "num_tokens": 583234035.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "grad_norm": 1.4069013595581055, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8843560814857483, + "num_tokens": 583274999.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "grad_norm": 1.4974191188812256, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8770160675048828, + "num_tokens": 583313571.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "grad_norm": 1.43293297290802, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8850194215774536, + "num_tokens": 583352039.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, 
+ "grad_norm": 1.4047491550445557, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8692634701728821, + "num_tokens": 583395528.0, + "step": 15291 + }, + { + "epoch": 1.9452995802060806, + "grad_norm": 1.509507656097412, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8825339674949646, + "num_tokens": 583431099.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "grad_norm": 1.4695569276809692, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8856854438781738, + "num_tokens": 583467212.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "grad_norm": 1.4306224584579468, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8862752914428711, + "num_tokens": 583505423.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "grad_norm": 1.6193374395370483, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8628616333007812, + "num_tokens": 583540043.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "grad_norm": 1.5331746339797974, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8757190704345703, + "num_tokens": 583579511.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "grad_norm": 1.5010226964950562, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8821106553077698, + "num_tokens": 583615346.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "grad_norm": 1.471753478050232, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8874799013137817, + "num_tokens": 583651336.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "grad_norm": 1.5059912204742432, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8786149024963379, + "num_tokens": 583690172.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "grad_norm": 1.4080849885940552, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 
0.8706164956092834, + "num_tokens": 583734643.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "grad_norm": 1.699297547340393, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8549284934997559, + "num_tokens": 583766303.0, + "step": 15301 + }, + { + "epoch": 1.9465716829919857, + "grad_norm": 1.404909610748291, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8786437511444092, + "num_tokens": 583805558.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "grad_norm": 1.4891823530197144, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.859581470489502, + "num_tokens": 583846552.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "grad_norm": 1.3957644701004028, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8758262395858765, + "num_tokens": 583885129.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "grad_norm": 1.5318199396133423, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8648631572723389, + "num_tokens": 583922521.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "grad_norm": 1.420570731163025, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8917386531829834, + "num_tokens": 583958014.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "grad_norm": 1.4658563137054443, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8923877477645874, + "num_tokens": 583993051.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "grad_norm": 1.4004589319229126, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8696227073669434, + "num_tokens": 584039524.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "grad_norm": 1.478365182876587, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8874369859695435, + "num_tokens": 584078467.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "grad_norm": 
1.4648464918136597, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8710442781448364, + "num_tokens": 584119451.0, + "step": 15310 + }, + { + "epoch": 1.9477165754993004, + "grad_norm": 1.5251630544662476, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8779802322387695, + "num_tokens": 584157128.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "grad_norm": 1.525608777999878, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8831720948219299, + "num_tokens": 584191113.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "grad_norm": 1.4218745231628418, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8776936531066895, + "num_tokens": 584230253.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "grad_norm": 1.586101770401001, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8839998245239258, + "num_tokens": 584263470.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "grad_norm": 1.6403464078903198, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8964251279830933, + "num_tokens": 584295674.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "grad_norm": 1.43638014793396, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8733605146408081, + "num_tokens": 584337276.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "grad_norm": 1.4675219058990479, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8829621076583862, + "num_tokens": 584378255.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "grad_norm": 1.5461065769195557, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8640869855880737, + "num_tokens": 584414885.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "grad_norm": 1.60141921043396, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8769949674606323, + "num_tokens": 
584446195.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "grad_norm": 1.3544996976852417, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.897419810295105, + "num_tokens": 584485330.0, + "step": 15320 + }, + { + "epoch": 1.9489886782852053, + "grad_norm": 1.4494751691818237, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.867597222328186, + "num_tokens": 584524959.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "grad_norm": 1.5508590936660767, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8772629499435425, + "num_tokens": 584559426.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "grad_norm": 1.3893938064575195, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.881150484085083, + "num_tokens": 584598649.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "grad_norm": 1.4787834882736206, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8799854516983032, + "num_tokens": 584632428.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "grad_norm": 1.422904372215271, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8698962330818176, + "num_tokens": 584674144.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "grad_norm": 1.5977431535720825, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8696329593658447, + "num_tokens": 584709651.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "grad_norm": 1.5177737474441528, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8721438646316528, + "num_tokens": 584746915.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "grad_norm": 1.4356589317321777, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8799960017204285, + "num_tokens": 584786316.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "grad_norm": 1.5425434112548828, + "learning_rate": 
1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.880750298500061, + "num_tokens": 584822092.0, + "step": 15329 + }, + { + "epoch": 1.95013357079252, + "grad_norm": 1.4826689958572388, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8761494755744934, + "num_tokens": 584861241.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "grad_norm": 1.4905556440353394, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8936235308647156, + "num_tokens": 584895452.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "grad_norm": 1.5591964721679688, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8652611970901489, + "num_tokens": 584932598.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "grad_norm": 1.6034859418869019, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8882241249084473, + "num_tokens": 584965465.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "grad_norm": 1.4755264520645142, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8830878734588623, + "num_tokens": 585006649.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "grad_norm": 1.352103352546692, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8828226327896118, + "num_tokens": 585049534.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "grad_norm": 1.4969496726989746, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8773873448371887, + "num_tokens": 585089756.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "grad_norm": 1.4250199794769287, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8971067070960999, + "num_tokens": 585127225.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "grad_norm": 1.666764736175537, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8679642677307129, + "num_tokens": 585161005.0, + "step": 15338 + }, + { + 
"epoch": 1.9512784632998348, + "grad_norm": 1.572412371635437, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8738299012184143, + "num_tokens": 585196830.0, + "step": 15339 + }, + { + "epoch": 1.951405673578425, + "grad_norm": 1.5080074071884155, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8727024793624878, + "num_tokens": 585236996.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "grad_norm": 1.3428863286972046, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.882921576499939, + "num_tokens": 585279991.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "grad_norm": 1.3667231798171997, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8887361288070679, + "num_tokens": 585320128.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "grad_norm": 1.5607742071151733, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8852924704551697, + "num_tokens": 585354691.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "grad_norm": 1.3486896753311157, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.891952395439148, + "num_tokens": 585396904.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "grad_norm": 1.4686371088027954, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8935239911079407, + "num_tokens": 585435054.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "grad_norm": 1.5062201023101807, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8864156007766724, + "num_tokens": 585471238.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "grad_norm": 1.7206922769546509, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8559250831604004, + "num_tokens": 585503410.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "grad_norm": 1.387600302696228, + "learning_rate": 1e-06, + "loss": 0.3004, + 
"mean_token_accuracy": 0.8916510939598083, + "num_tokens": 585543288.0, + "step": 15348 + }, + { + "epoch": 1.9525505660857396, + "grad_norm": 1.699965238571167, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8825334310531616, + "num_tokens": 585573675.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "grad_norm": 1.3679428100585938, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8716210126876831, + "num_tokens": 585620439.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "grad_norm": 1.3714169263839722, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8840614557266235, + "num_tokens": 585663123.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "grad_norm": 1.4220967292785645, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8841961026191711, + "num_tokens": 585702986.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "grad_norm": 1.5993412733078003, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8596165776252747, + "num_tokens": 585739665.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "grad_norm": 1.504531979560852, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8815792798995972, + "num_tokens": 585779558.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "grad_norm": 1.490502119064331, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8780356049537659, + "num_tokens": 585819330.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "grad_norm": 1.2824187278747559, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.889654278755188, + "num_tokens": 585866076.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "grad_norm": 1.4206961393356323, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8725173473358154, + "num_tokens": 585906199.0, + "step": 15357 + }, + { + "epoch": 
1.9536954585930544, + "grad_norm": 1.3554481267929077, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8803752660751343, + "num_tokens": 585948523.0, + "step": 15358 + }, + { + "epoch": 1.953822668871645, + "grad_norm": 1.484989047050476, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8827687501907349, + "num_tokens": 585984269.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "grad_norm": 1.4596210718154907, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8809280395507812, + "num_tokens": 586025625.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "grad_norm": 1.6624064445495605, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8903253078460693, + "num_tokens": 586054808.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "grad_norm": 1.5719714164733887, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8773086071014404, + "num_tokens": 586091808.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "grad_norm": 1.4393792152404785, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8946375846862793, + "num_tokens": 586128573.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "grad_norm": 1.4652924537658691, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8843337297439575, + "num_tokens": 586162031.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "grad_norm": 1.585164189338684, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8790825605392456, + "num_tokens": 586195511.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "grad_norm": 1.4336315393447876, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8654097318649292, + "num_tokens": 586237825.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "grad_norm": 1.4124529361724854, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 
0.8881289958953857, + "num_tokens": 586276537.0, + "step": 15367 + }, + { + "epoch": 1.9549675613789594, + "grad_norm": 1.4454927444458008, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8804691433906555, + "num_tokens": 586318420.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "grad_norm": 1.4568161964416504, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8874877691268921, + "num_tokens": 586355175.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "grad_norm": 1.480183720588684, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8759489059448242, + "num_tokens": 586392990.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "grad_norm": 1.515708565711975, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8826594352722168, + "num_tokens": 586431417.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "grad_norm": 1.5118887424468994, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8840453624725342, + "num_tokens": 586468949.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "grad_norm": 1.4855515956878662, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9009882211685181, + "num_tokens": 586501068.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "grad_norm": 1.772273063659668, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8751538991928101, + "num_tokens": 586533456.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "grad_norm": 1.4708384275436401, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8792934417724609, + "num_tokens": 586568932.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "grad_norm": 1.4723528623580933, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8816037178039551, + "num_tokens": 586608168.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "grad_norm": 
1.5466911792755127, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8573089838027954, + "num_tokens": 586646568.0, + "step": 15377 + }, + { + "epoch": 1.9562396641648645, + "grad_norm": 1.5839885473251343, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8775388598442078, + "num_tokens": 586678002.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "grad_norm": 1.5988301038742065, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8824161291122437, + "num_tokens": 586710869.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "grad_norm": 1.624267578125, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8809558749198914, + "num_tokens": 586745729.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "grad_norm": 1.4494997262954712, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8963699340820312, + "num_tokens": 586779450.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "grad_norm": 1.5618605613708496, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8778846859931946, + "num_tokens": 586816366.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "grad_norm": 1.4392367601394653, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8764001131057739, + "num_tokens": 586857287.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "grad_norm": 1.5213549137115479, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8833202719688416, + "num_tokens": 586891646.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "grad_norm": 1.6655243635177612, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8675327301025391, + "num_tokens": 586926925.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "grad_norm": 1.5068912506103516, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8830271363258362, + "num_tokens": 
586963130.0, + "step": 15386 + }, + { + "epoch": 1.9573845566721793, + "grad_norm": 1.5240628719329834, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8635200262069702, + "num_tokens": 587001697.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "grad_norm": 1.4801126718521118, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.862864077091217, + "num_tokens": 587044581.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "grad_norm": 1.3887701034545898, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8780561089515686, + "num_tokens": 587085576.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "grad_norm": 1.5046254396438599, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.886764407157898, + "num_tokens": 587118952.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "grad_norm": 1.659065842628479, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8746814727783203, + "num_tokens": 587149302.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "grad_norm": 1.3721281290054321, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8805691599845886, + "num_tokens": 587193676.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "grad_norm": 1.4253926277160645, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8990266919136047, + "num_tokens": 587228286.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "grad_norm": 1.528059959411621, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.884677529335022, + "num_tokens": 587263110.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "grad_norm": 1.4322434663772583, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8734006881713867, + "num_tokens": 587305630.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "grad_norm": 1.4491859674453735, + "learning_rate": 
1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8887931108474731, + "num_tokens": 587341977.0, + "step": 15396 + }, + { + "epoch": 1.958656659458084, + "grad_norm": 1.5562068223953247, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8936868906021118, + "num_tokens": 587372402.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "grad_norm": 1.3759714365005493, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8761754035949707, + "num_tokens": 587415279.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "grad_norm": 1.4934395551681519, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8766083717346191, + "num_tokens": 587451225.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "grad_norm": 1.4765452146530151, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8836201429367065, + "num_tokens": 587484408.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "grad_norm": 1.3983752727508545, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8783305883407593, + "num_tokens": 587525845.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "grad_norm": 1.5415483713150024, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.873259425163269, + "num_tokens": 587560893.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "grad_norm": 1.5195949077606201, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8748244047164917, + "num_tokens": 587599793.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "grad_norm": 1.5352585315704346, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8886970281600952, + "num_tokens": 587632806.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "grad_norm": 1.5815101861953735, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8832147121429443, + "num_tokens": 587665722.0, + "step": 15405 + }, + 
{ + "epoch": 1.9598015519653988, + "grad_norm": 1.5976618528366089, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8765936493873596, + "num_tokens": 587699541.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "grad_norm": 1.426551103591919, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8776919841766357, + "num_tokens": 587741755.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "grad_norm": 1.3944792747497559, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8840537667274475, + "num_tokens": 587780924.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "grad_norm": 1.4245322942733765, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8847938776016235, + "num_tokens": 587819143.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "grad_norm": 1.4203734397888184, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8601254224777222, + "num_tokens": 587866902.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "grad_norm": 1.4988890886306763, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8627907037734985, + "num_tokens": 587909621.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "grad_norm": 1.547720193862915, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8787306547164917, + "num_tokens": 587948573.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "grad_norm": 1.4102617502212524, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8755967020988464, + "num_tokens": 587992755.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "grad_norm": 1.6348011493682861, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8751857280731201, + "num_tokens": 588027466.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "grad_norm": 1.481056809425354, + "learning_rate": 1e-06, + "loss": 0.3216, + 
"mean_token_accuracy": 0.8854646682739258, + "num_tokens": 588064660.0, + "step": 15415 + }, + { + "epoch": 1.961073654751304, + "grad_norm": 1.4115533828735352, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8775019645690918, + "num_tokens": 588106869.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "grad_norm": 1.5675809383392334, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8773198127746582, + "num_tokens": 588141545.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "grad_norm": 1.3740248680114746, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.889974057674408, + "num_tokens": 588180655.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "grad_norm": 1.4612988233566284, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.893510103225708, + "num_tokens": 588213453.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "grad_norm": 1.4565812349319458, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8800125122070312, + "num_tokens": 588253389.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "grad_norm": 1.4773297309875488, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8675898313522339, + "num_tokens": 588293088.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "grad_norm": 1.5924019813537598, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8662451505661011, + "num_tokens": 588327853.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "grad_norm": 1.5627431869506836, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8780930042266846, + "num_tokens": 588360079.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "grad_norm": 1.3892375230789185, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8730043172836304, + "num_tokens": 588404617.0, + "step": 15424 + }, + { + "epoch": 
1.9622185472586184, + "grad_norm": 1.422570824623108, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8760095238685608, + "num_tokens": 588445043.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "grad_norm": 1.4645073413848877, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8708086609840393, + "num_tokens": 588486125.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "grad_norm": 1.4880162477493286, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8776072263717651, + "num_tokens": 588523492.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "grad_norm": 1.3379175662994385, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.884635865688324, + "num_tokens": 588567307.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "grad_norm": 1.516676664352417, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8904455900192261, + "num_tokens": 588601959.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "grad_norm": 1.5875369310379028, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8779535293579102, + "num_tokens": 588638020.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "grad_norm": 1.352945327758789, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8911906480789185, + "num_tokens": 588677505.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "grad_norm": 1.4252920150756836, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8845912218093872, + "num_tokens": 588718735.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "grad_norm": 1.376272439956665, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.879729151725769, + "num_tokens": 588761096.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "grad_norm": 1.5494213104248047, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 
0.8779507875442505, + "num_tokens": 588795888.0, + "step": 15434 + }, + { + "epoch": 1.9634906500445237, + "grad_norm": 1.3985849618911743, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8809078931808472, + "num_tokens": 588838115.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "grad_norm": 1.3653818368911743, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8852611780166626, + "num_tokens": 588879944.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "grad_norm": 1.4313695430755615, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.857531726360321, + "num_tokens": 588925579.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "grad_norm": 1.4660056829452515, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8786556720733643, + "num_tokens": 588963263.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "grad_norm": 1.7961856126785278, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8805243968963623, + "num_tokens": 588995019.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "grad_norm": 1.4699926376342773, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8838986754417419, + "num_tokens": 589031576.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "grad_norm": 1.477677345275879, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.878456711769104, + "num_tokens": 589067597.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "grad_norm": 1.471159815788269, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8961288928985596, + "num_tokens": 589104766.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "grad_norm": 1.511285424232483, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.891384482383728, + "num_tokens": 589136237.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "grad_norm": 
1.423359990119934, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8720426559448242, + "num_tokens": 589177184.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "grad_norm": 1.3596851825714111, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8870710730552673, + "num_tokens": 589214874.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "grad_norm": 1.4804037809371948, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8759948015213013, + "num_tokens": 589252842.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "grad_norm": 1.709262490272522, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8553898334503174, + "num_tokens": 589289288.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "grad_norm": 1.4232654571533203, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8778560161590576, + "num_tokens": 589330234.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "grad_norm": 1.3588149547576904, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.890761137008667, + "num_tokens": 589370460.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "grad_norm": 1.4277275800704956, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8733466267585754, + "num_tokens": 589412582.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "grad_norm": 1.52337646484375, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8747777342796326, + "num_tokens": 589451472.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "grad_norm": 1.4898929595947266, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.883949875831604, + "num_tokens": 589489022.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "grad_norm": 1.4123319387435913, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.88639897108078, + "num_tokens": 
589527449.0, + "step": 15453 + }, + { + "epoch": 1.9659076453377433, + "grad_norm": 1.4644355773925781, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8806033134460449, + "num_tokens": 589565299.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "grad_norm": 1.619329571723938, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8633167743682861, + "num_tokens": 589601119.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "grad_norm": 1.5487362146377563, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8693955540657043, + "num_tokens": 589636765.0, + "step": 15456 + }, + { + "epoch": 1.966289276173515, + "grad_norm": 1.6002721786499023, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.861443042755127, + "num_tokens": 589673536.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "grad_norm": 1.5963574647903442, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8786742687225342, + "num_tokens": 589705312.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "grad_norm": 1.4829894304275513, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8870423436164856, + "num_tokens": 589743501.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "grad_norm": 1.4004193544387817, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8855910301208496, + "num_tokens": 589782444.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "grad_norm": 1.5479111671447754, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8787069916725159, + "num_tokens": 589818929.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "grad_norm": 1.635506272315979, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8808977603912354, + "num_tokens": 589852242.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "grad_norm": 1.6759809255599976, + "learning_rate": 
1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8829812407493591, + "num_tokens": 589885408.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "grad_norm": 1.5052716732025146, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8908523321151733, + "num_tokens": 589919810.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "grad_norm": 1.4288101196289062, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8792845606803894, + "num_tokens": 589962228.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "grad_norm": 1.3365695476531982, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.897591233253479, + "num_tokens": 590003403.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "grad_norm": 1.5405083894729614, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8824185132980347, + "num_tokens": 590039264.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "grad_norm": 1.6008206605911255, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8674069046974182, + "num_tokens": 590075511.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "grad_norm": 1.4766677618026733, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8843100070953369, + "num_tokens": 590111649.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "grad_norm": 1.4245171546936035, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.890251874923706, + "num_tokens": 590151614.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "grad_norm": 1.4434144496917725, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8848789930343628, + "num_tokens": 590188528.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "grad_norm": 1.51443612575531, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8911693096160889, + "num_tokens": 590220805.0, + "step": 15472 + }, + { + 
"epoch": 1.968324640630963, + "grad_norm": 1.5605422258377075, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.887139081954956, + "num_tokens": 590254272.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "grad_norm": 1.5887142419815063, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8799141645431519, + "num_tokens": 590288277.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "grad_norm": 1.5804729461669922, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8761841058731079, + "num_tokens": 590324298.0, + "step": 15475 + }, + { + "epoch": 1.9687062714667345, + "grad_norm": 1.3872476816177368, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8915026187896729, + "num_tokens": 590365669.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "grad_norm": 1.4694486856460571, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8749840259552002, + "num_tokens": 590402809.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "grad_norm": 1.407198190689087, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8833555579185486, + "num_tokens": 590441578.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "grad_norm": 1.5153471231460571, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8799012899398804, + "num_tokens": 590477958.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "grad_norm": 1.677412986755371, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.867048442363739, + "num_tokens": 590511366.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "grad_norm": 1.4528074264526367, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8835923671722412, + "num_tokens": 590549312.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "grad_norm": 1.4568325281143188, + "learning_rate": 1e-06, + "loss": 0.3424, + 
"mean_token_accuracy": 0.8818181753158569, + "num_tokens": 590587159.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "grad_norm": 1.408674955368042, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.881899893283844, + "num_tokens": 590627977.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "grad_norm": 1.3900799751281738, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8834720849990845, + "num_tokens": 590667837.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "grad_norm": 1.4887394905090332, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8797615170478821, + "num_tokens": 590707652.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "grad_norm": 1.5517486333847046, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8697335124015808, + "num_tokens": 590745974.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "grad_norm": 1.5880491733551025, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8763697147369385, + "num_tokens": 590780566.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "grad_norm": 1.4247610569000244, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.875354528427124, + "num_tokens": 590821673.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "grad_norm": 1.6138836145401, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8590081930160522, + "num_tokens": 590862443.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "grad_norm": 1.5050787925720215, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8923057317733765, + "num_tokens": 590895830.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "grad_norm": 1.4259155988693237, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8669525384902954, + "num_tokens": 590941023.0, + "step": 15491 + }, + { + "epoch": 1.9707416359241827, 
+ "grad_norm": 1.597251057624817, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8701838850975037, + "num_tokens": 590977593.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "grad_norm": 1.4799761772155762, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8823354244232178, + "num_tokens": 591014898.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "grad_norm": 1.3897602558135986, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8881374597549438, + "num_tokens": 591051493.0, + "step": 15494 + }, + { + "epoch": 1.971123266759954, + "grad_norm": 1.4240095615386963, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8893591165542603, + "num_tokens": 591091769.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "grad_norm": 1.7465499639511108, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8506909608840942, + "num_tokens": 591124699.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "grad_norm": 1.5819449424743652, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.883256196975708, + "num_tokens": 591159618.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "grad_norm": 1.351986289024353, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8844155073165894, + "num_tokens": 591199954.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "grad_norm": 1.4523003101348877, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8761858940124512, + "num_tokens": 591240196.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "grad_norm": 1.4233901500701904, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.869103193283081, + "num_tokens": 591279696.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "grad_norm": 1.5433772802352905, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8666542768478394, + 
"num_tokens": 591312815.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "grad_norm": 1.5729408264160156, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.878538966178894, + "num_tokens": 591345225.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "grad_norm": 1.6528370380401611, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.867652177810669, + "num_tokens": 591376083.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "grad_norm": 1.530158281326294, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8643573522567749, + "num_tokens": 591417920.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "grad_norm": 1.411698341369629, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8680930733680725, + "num_tokens": 591461720.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "grad_norm": 1.3711689710617065, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8877091407775879, + "num_tokens": 591500565.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "grad_norm": 1.4109939336776733, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8878837823867798, + "num_tokens": 591537296.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "grad_norm": 1.3112971782684326, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8933491706848145, + "num_tokens": 591577487.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "grad_norm": 1.562071442604065, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8887503147125244, + "num_tokens": 591611831.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "grad_norm": 1.5797616243362427, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8870620727539062, + "num_tokens": 591642769.0, + "step": 15510 + }, + { + "epoch": 1.9731586312174023, + "grad_norm": 1.4148025512695312, + 
"learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8887181878089905, + "num_tokens": 591681084.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "grad_norm": 1.5070819854736328, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8835134506225586, + "num_tokens": 591716451.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "grad_norm": 1.5990318059921265, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8737479448318481, + "num_tokens": 591753427.0, + "step": 15513 + }, + { + "epoch": 1.973540262053174, + "grad_norm": 1.5388234853744507, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8900085687637329, + "num_tokens": 591787645.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "grad_norm": 1.7624784708023071, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8772094249725342, + "num_tokens": 591821729.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "grad_norm": 1.4648656845092773, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8746433258056641, + "num_tokens": 591859737.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "grad_norm": 1.7649224996566772, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8761765360832214, + "num_tokens": 591891481.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "grad_norm": 1.523666501045227, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8826915621757507, + "num_tokens": 591924530.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "grad_norm": 1.45271635055542, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.872765302658081, + "num_tokens": 591962524.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "grad_norm": 1.432023048400879, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8837045431137085, + "num_tokens": 592006267.0, + "step": 
15520 + }, + { + "epoch": 1.9744307340033074, + "grad_norm": 1.4961752891540527, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8913235664367676, + "num_tokens": 592041253.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "grad_norm": 1.632352590560913, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8832465410232544, + "num_tokens": 592073291.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "grad_norm": 1.5422827005386353, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.85423344373703, + "num_tokens": 592114513.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "grad_norm": 1.4696918725967407, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8728644847869873, + "num_tokens": 592155914.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "grad_norm": 1.4751323461532593, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8701339960098267, + "num_tokens": 592198552.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "grad_norm": 1.4410611391067505, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8638339042663574, + "num_tokens": 592240156.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "grad_norm": 1.504630208015442, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8688414096832275, + "num_tokens": 592278733.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "grad_norm": 1.4445509910583496, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8811667561531067, + "num_tokens": 592316005.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "grad_norm": 1.6372634172439575, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8713575601577759, + "num_tokens": 592348873.0, + "step": 15529 + }, + { + "epoch": 1.9755756265106221, + "grad_norm": 1.5529835224151611, + "learning_rate": 1e-06, + "loss": 0.3869, + 
"mean_token_accuracy": 0.8664940595626831, + "num_tokens": 592385681.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "grad_norm": 1.5935498476028442, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8660844564437866, + "num_tokens": 592419944.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "grad_norm": 1.498841643333435, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8816415071487427, + "num_tokens": 592455832.0, + "step": 15532 + }, + { + "epoch": 1.9759572573463937, + "grad_norm": 1.4182276725769043, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8831731081008911, + "num_tokens": 592494478.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "grad_norm": 1.5775506496429443, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.876054048538208, + "num_tokens": 592534827.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "grad_norm": 1.4647068977355957, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8851824402809143, + "num_tokens": 592571226.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "grad_norm": 1.513729453086853, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8841372728347778, + "num_tokens": 592607313.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "grad_norm": 1.5556762218475342, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8700403571128845, + "num_tokens": 592642188.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "grad_norm": 1.3844718933105469, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8864094614982605, + "num_tokens": 592683305.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "grad_norm": 1.4063172340393066, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8831100463867188, + "num_tokens": 592720899.0, + "step": 15539 + }, + { + "epoch": 
1.9768477292965272, + "grad_norm": 1.5394008159637451, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8799189329147339, + "num_tokens": 592755167.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "grad_norm": 1.5271053314208984, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8737855553627014, + "num_tokens": 592790516.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "grad_norm": 1.4995183944702148, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8895835876464844, + "num_tokens": 592825408.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "grad_norm": 1.5169504880905151, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8722832202911377, + "num_tokens": 592866376.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "grad_norm": 1.440738558769226, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8794635534286499, + "num_tokens": 592903884.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "grad_norm": 1.4744672775268555, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8624904155731201, + "num_tokens": 592945151.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "grad_norm": 1.3181473016738892, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8999682664871216, + "num_tokens": 592982736.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "grad_norm": 1.3939387798309326, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8811218738555908, + "num_tokens": 593024452.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "grad_norm": 1.3713536262512207, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8863588571548462, + "num_tokens": 593065769.0, + "step": 15548 + }, + { + "epoch": 1.9779926218038417, + "grad_norm": 1.6136600971221924, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 
0.8710321187973022, + "num_tokens": 593099557.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "grad_norm": 1.449412226676941, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8693403005599976, + "num_tokens": 593144911.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "grad_norm": 1.5600589513778687, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8922901153564453, + "num_tokens": 593181827.0, + "step": 15551 + }, + { + "epoch": 1.9783742526396133, + "grad_norm": 1.3579020500183105, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8898849487304688, + "num_tokens": 593222862.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "grad_norm": 1.365990161895752, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8838946223258972, + "num_tokens": 593265120.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "grad_norm": 1.4086782932281494, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8817986249923706, + "num_tokens": 593303923.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "grad_norm": 1.4451404809951782, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8782588243484497, + "num_tokens": 593346663.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "grad_norm": 1.3817657232284546, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8786712884902954, + "num_tokens": 593387645.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "grad_norm": 1.4674410820007324, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8601944446563721, + "num_tokens": 593431179.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "grad_norm": 1.626950740814209, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8652023077011108, + "num_tokens": 593467803.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "grad_norm": 
1.5126584768295288, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8772527575492859, + "num_tokens": 593501765.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "grad_norm": 1.5293960571289062, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8742708563804626, + "num_tokens": 593539905.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "grad_norm": 1.4900561571121216, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8726879358291626, + "num_tokens": 593575848.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "grad_norm": 1.4698808193206787, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8759211301803589, + "num_tokens": 593616873.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "grad_norm": 1.635180115699768, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8626739382743835, + "num_tokens": 593652093.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "grad_norm": 1.444390058517456, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8674955368041992, + "num_tokens": 593694145.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "grad_norm": 1.4607654809951782, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8724467754364014, + "num_tokens": 593736126.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "grad_norm": 1.4584629535675049, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8817173838615417, + "num_tokens": 593772289.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "grad_norm": 1.4052709341049194, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.885574460029602, + "num_tokens": 593816526.0, + "step": 15567 + }, + { + "epoch": 1.9804096170970613, + "grad_norm": 1.45283043384552, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8718008399009705, + "num_tokens": 
593856499.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "grad_norm": 1.4094911813735962, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8849202394485474, + "num_tokens": 593896024.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "grad_norm": 1.3303438425064087, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8946119546890259, + "num_tokens": 593936508.0, + "step": 15570 + }, + { + "epoch": 1.980791247932833, + "grad_norm": 1.3489340543746948, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8837407827377319, + "num_tokens": 593984176.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "grad_norm": 1.52315354347229, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8778805136680603, + "num_tokens": 594024368.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "grad_norm": 1.509140968322754, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8757827877998352, + "num_tokens": 594062533.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "grad_norm": 1.5199123620986938, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8845973610877991, + "num_tokens": 594097039.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "grad_norm": 1.3894774913787842, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8772785663604736, + "num_tokens": 594137426.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "grad_norm": 1.5000351667404175, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8866479396820068, + "num_tokens": 594173433.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "grad_norm": 1.4470114707946777, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8790907263755798, + "num_tokens": 594213864.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "grad_norm": 1.680186152458191, + "learning_rate": 1e-06, 
+ "loss": 0.4253, + "mean_token_accuracy": 0.857845664024353, + "num_tokens": 594248050.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "grad_norm": 1.6411254405975342, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8649482727050781, + "num_tokens": 594282686.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "grad_norm": 1.6303776502609253, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8737770915031433, + "num_tokens": 594318355.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "grad_norm": 1.3836643695831299, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8810608386993408, + "num_tokens": 594362117.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "grad_norm": 1.5227245092391968, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8953063488006592, + "num_tokens": 594397749.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "grad_norm": 1.4428787231445312, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8825190663337708, + "num_tokens": 594435518.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "grad_norm": 1.5159399509429932, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8865488767623901, + "num_tokens": 594469362.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "grad_norm": 1.4540528059005737, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8802146911621094, + "num_tokens": 594506661.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "grad_norm": 1.557878017425537, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8699866533279419, + "num_tokens": 594547151.0, + "step": 15586 + }, + { + "epoch": 1.9828266123902811, + "grad_norm": 1.3066107034683228, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8821383714675903, + "num_tokens": 594595142.0, + "step": 15587 + }, + { + 
"epoch": 1.9829538226688717, + "grad_norm": 1.5020400285720825, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8770349025726318, + "num_tokens": 594631769.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "grad_norm": 1.3640820980072021, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8739228248596191, + "num_tokens": 594676407.0, + "step": 15589 + }, + { + "epoch": 1.9832082432260527, + "grad_norm": 1.4200173616409302, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8796586990356445, + "num_tokens": 594718687.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "grad_norm": 1.4519093036651611, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8856697082519531, + "num_tokens": 594759386.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "grad_norm": 1.4176342487335205, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8769135475158691, + "num_tokens": 594800416.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "grad_norm": 1.3791486024856567, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8874063491821289, + "num_tokens": 594836482.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "grad_norm": 1.4323526620864868, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8901032209396362, + "num_tokens": 594872930.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "grad_norm": 1.4078956842422485, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8848804235458374, + "num_tokens": 594913405.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "grad_norm": 1.628772497177124, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8747374415397644, + "num_tokens": 594949583.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "grad_norm": 1.5583178997039795, + "learning_rate": 1e-06, + "loss": 0.4, + 
"mean_token_accuracy": 0.8565560579299927, + "num_tokens": 594989420.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "grad_norm": 1.3876157999038696, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8843954205513, + "num_tokens": 595033849.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "grad_norm": 1.462360143661499, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8798327445983887, + "num_tokens": 595074539.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "grad_norm": 1.4059852361679077, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8807402849197388, + "num_tokens": 595115649.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "grad_norm": 1.5937031507492065, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8875438570976257, + "num_tokens": 595147273.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "grad_norm": 1.498852014541626, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8781021237373352, + "num_tokens": 595184672.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "grad_norm": 1.5747199058532715, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8760421872138977, + "num_tokens": 595225247.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "grad_norm": 1.5590049028396606, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8738236427307129, + "num_tokens": 595263575.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "grad_norm": 1.3404771089553833, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8840246200561523, + "num_tokens": 595306672.0, + "step": 15605 + }, + { + "epoch": 1.985243607683501, + "grad_norm": 1.5142953395843506, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8873552083969116, + "num_tokens": 595340424.0, + "step": 15606 + }, + { + "epoch": 
1.9853708179620915, + "grad_norm": 1.4435608386993408, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8840499520301819, + "num_tokens": 595381409.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "grad_norm": 1.511547327041626, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.89518803358078, + "num_tokens": 595416815.0, + "step": 15608 + }, + { + "epoch": 1.9856252385192723, + "grad_norm": 1.4188183546066284, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8852211236953735, + "num_tokens": 595454151.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "grad_norm": 1.404724359512329, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8933484554290771, + "num_tokens": 595491409.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "grad_norm": 1.4890110492706299, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8713377714157104, + "num_tokens": 595532313.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "grad_norm": 1.4638957977294922, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8769396543502808, + "num_tokens": 595568546.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "grad_norm": 1.6693230867385864, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8627467751502991, + "num_tokens": 595604321.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "grad_norm": 1.4001824855804443, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8731966018676758, + "num_tokens": 595645054.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "grad_norm": 1.5231902599334717, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8784276247024536, + "num_tokens": 595682402.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "grad_norm": 1.5268439054489136, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 
0.868847131729126, + "num_tokens": 595721030.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "grad_norm": 1.3762296438217163, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8944894075393677, + "num_tokens": 595762408.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "grad_norm": 1.4412139654159546, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8757724761962891, + "num_tokens": 595801710.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "grad_norm": 1.4460622072219849, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.884655237197876, + "num_tokens": 595840441.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "grad_norm": 1.4718269109725952, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8841536045074463, + "num_tokens": 595874402.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "grad_norm": 1.5726279020309448, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8577391505241394, + "num_tokens": 595911017.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "grad_norm": 1.497164011001587, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8828833103179932, + "num_tokens": 595947792.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "grad_norm": 1.4566198587417603, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8684942722320557, + "num_tokens": 595986354.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "grad_norm": 1.4892476797103882, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8924564123153687, + "num_tokens": 596021360.0, + "step": 15624 + }, + { + "epoch": 1.9876606029767205, + "grad_norm": 1.4166971445083618, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.889327883720398, + "num_tokens": 596057897.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "grad_norm": 
1.396102786064148, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.88402259349823, + "num_tokens": 596097701.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "grad_norm": 1.547674536705017, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8692078590393066, + "num_tokens": 596134642.0, + "step": 15627 + }, + { + "epoch": 1.9880422338124921, + "grad_norm": 1.4824976921081543, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8856097459793091, + "num_tokens": 596171173.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "grad_norm": 1.499135136604309, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8759716153144836, + "num_tokens": 596208763.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "grad_norm": 1.560296893119812, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8889431953430176, + "num_tokens": 596241147.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "grad_norm": 1.533815860748291, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8872382640838623, + "num_tokens": 596279376.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "grad_norm": 1.3611102104187012, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8792790174484253, + "num_tokens": 596321467.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "grad_norm": 1.4360246658325195, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8788374662399292, + "num_tokens": 596362819.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "grad_norm": 1.3918907642364502, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8807061314582825, + "num_tokens": 596405442.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "grad_norm": 1.4639757871627808, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8694777488708496, + "num_tokens": 
596445443.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "grad_norm": 1.4743459224700928, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8687623739242554, + "num_tokens": 596485518.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "grad_norm": 1.5797436237335205, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8773989081382751, + "num_tokens": 596522112.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "grad_norm": 1.5666590929031372, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8932422399520874, + "num_tokens": 596554210.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "grad_norm": 1.5984419584274292, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.879299521446228, + "num_tokens": 596588926.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "grad_norm": 1.4291465282440186, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8759497404098511, + "num_tokens": 596630836.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "grad_norm": 1.4551769495010376, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8865864276885986, + "num_tokens": 596670973.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "grad_norm": 1.4421201944351196, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.869464635848999, + "num_tokens": 596713951.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "grad_norm": 1.4232152700424194, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8885862231254578, + "num_tokens": 596754278.0, + "step": 15643 + }, + { + "epoch": 1.9900775982699401, + "grad_norm": 1.4363526105880737, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8895190954208374, + "num_tokens": 596787878.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "grad_norm": 1.4848542213439941, + "learning_rate": 
1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8728501200675964, + "num_tokens": 596830327.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "grad_norm": 1.5305715799331665, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.870131254196167, + "num_tokens": 596869300.0, + "step": 15646 + }, + { + "epoch": 1.9904592291057117, + "grad_norm": 1.4929934740066528, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8794428110122681, + "num_tokens": 596911355.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "grad_norm": 1.435906171798706, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8733923435211182, + "num_tokens": 596948827.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "grad_norm": 1.5430293083190918, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8801991939544678, + "num_tokens": 596980210.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "grad_norm": 1.407537579536438, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8765658140182495, + "num_tokens": 597021249.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "grad_norm": 1.3698183298110962, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8964433670043945, + "num_tokens": 597062288.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "grad_norm": 1.4048064947128296, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8751333951950073, + "num_tokens": 597106091.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "grad_norm": 1.3881505727767944, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8739657402038574, + "num_tokens": 597149125.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "grad_norm": 1.4701136350631714, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8822306394577026, + "num_tokens": 597188375.0, + "step": 15654 + }, + { 
+ "epoch": 1.991476911334436, + "grad_norm": 1.4797307252883911, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8802075386047363, + "num_tokens": 597229264.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "grad_norm": 1.5246031284332275, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8724891543388367, + "num_tokens": 597268805.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "grad_norm": 1.4084590673446655, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.873993992805481, + "num_tokens": 597310265.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "grad_norm": 1.4786773920059204, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8883038759231567, + "num_tokens": 597345599.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "grad_norm": 1.742117166519165, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8782045841217041, + "num_tokens": 597381658.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "grad_norm": 1.4489532709121704, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.882018506526947, + "num_tokens": 597421044.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "grad_norm": 1.5207676887512207, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8818772435188293, + "num_tokens": 597457742.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "grad_norm": 1.3831586837768555, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8705393075942993, + "num_tokens": 597504927.0, + "step": 15662 + }, + { + "epoch": 1.99249459356316, + "grad_norm": 1.5479936599731445, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8719329833984375, + "num_tokens": 597541936.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "grad_norm": 1.765191912651062, + "learning_rate": 1e-06, + "loss": 0.385, + 
"mean_token_accuracy": 0.8686318397521973, + "num_tokens": 597571451.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "grad_norm": 1.4710835218429565, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8845105171203613, + "num_tokens": 597612639.0, + "step": 15665 + }, + { + "epoch": 1.9928762243989313, + "grad_norm": 1.5885865688323975, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8736696243286133, + "num_tokens": 597646871.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "grad_norm": 1.3926377296447754, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8820051550865173, + "num_tokens": 597687873.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "grad_norm": 1.6171183586120605, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8719048500061035, + "num_tokens": 597720684.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "grad_norm": 1.6054960489273071, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8571243286132812, + "num_tokens": 597761439.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "grad_norm": 1.4206571578979492, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8816616535186768, + "num_tokens": 597800823.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "grad_norm": 1.5102863311767578, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8895176649093628, + "num_tokens": 597834708.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "grad_norm": 1.5672156810760498, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8972298502922058, + "num_tokens": 597863569.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "grad_norm": 1.3862409591674805, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8777851462364197, + "num_tokens": 597906560.0, + "step": 15673 + }, + { + "epoch": 
1.9938939066276555, + "grad_norm": 1.3282047510147095, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8830223083496094, + "num_tokens": 597950479.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "grad_norm": 1.3559199571609497, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8793213367462158, + "num_tokens": 597994632.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "grad_norm": 1.5320366621017456, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8777378797531128, + "num_tokens": 598030390.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "grad_norm": 1.4718737602233887, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8862512707710266, + "num_tokens": 598067682.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "grad_norm": 1.5384682416915894, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8785115480422974, + "num_tokens": 598106769.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "grad_norm": 1.5420852899551392, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8866477012634277, + "num_tokens": 598143916.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "grad_norm": 1.657150387763977, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8752055168151855, + "num_tokens": 598180948.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "grad_norm": 1.4698901176452637, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8819544911384583, + "num_tokens": 598219137.0, + "step": 15681 + }, + { + "epoch": 1.9949115888563798, + "grad_norm": 1.5285629034042358, + "learning_rate": 1e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.8968524932861328, + "num_tokens": 598258264.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "grad_norm": 1.6003776788711548, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 
0.8822548985481262, + "num_tokens": 598292991.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "grad_norm": 1.5056699514389038, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8765867948532104, + "num_tokens": 598332232.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "grad_norm": 1.465809941291809, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8673054575920105, + "num_tokens": 598372980.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "grad_norm": 1.5693930387496948, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8775544762611389, + "num_tokens": 598406362.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "grad_norm": 1.5062910318374634, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8841763734817505, + "num_tokens": 598442127.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "grad_norm": 1.3126469850540161, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8797900676727295, + "num_tokens": 598490677.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "grad_norm": 1.6201586723327637, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8906595706939697, + "num_tokens": 598520970.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "grad_norm": 1.404739260673523, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8822084665298462, + "num_tokens": 598562514.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "grad_norm": 1.4931565523147583, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8759890794754028, + "num_tokens": 598602626.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "grad_norm": 1.4156670570373535, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8733114004135132, + "num_tokens": 598644148.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "grad_norm": 
1.4821404218673706, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8805538415908813, + "num_tokens": 598678758.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "grad_norm": 1.493179440498352, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8894950151443481, + "num_tokens": 598715512.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "grad_norm": 1.4062211513519287, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8690311908721924, + "num_tokens": 598758503.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "grad_norm": 1.4491623640060425, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8783481121063232, + "num_tokens": 598797642.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "grad_norm": 1.4289042949676514, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8746843338012695, + "num_tokens": 598839931.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "grad_norm": 1.5177079439163208, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8887600898742676, + "num_tokens": 598874289.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "grad_norm": 1.4260492324829102, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8751689791679382, + "num_tokens": 598914946.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "grad_norm": 1.4418798685073853, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8826972246170044, + "num_tokens": 598950888.0, + "step": 15700 + }, + { + "epoch": 1.9973285841495994, + "grad_norm": 1.4971596002578735, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8699281811714172, + "num_tokens": 598989563.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "grad_norm": 1.4470831155776978, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8884483575820923, + 
"num_tokens": 599023689.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "grad_norm": 1.4509328603744507, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8728194832801819, + "num_tokens": 599061964.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "grad_norm": 1.3823554515838623, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8947388529777527, + "num_tokens": 599102836.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "grad_norm": 1.4817242622375488, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8744277954101562, + "num_tokens": 599142429.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "grad_norm": 1.6271772384643555, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8822089433670044, + "num_tokens": 599177815.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "grad_norm": 1.4257094860076904, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8838266134262085, + "num_tokens": 599215473.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "grad_norm": 1.4596024751663208, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8729966282844543, + "num_tokens": 599255987.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "grad_norm": 1.5930125713348389, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8819162845611572, + "num_tokens": 599289592.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "grad_norm": 1.4868119955062866, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8918942809104919, + "num_tokens": 599322651.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "grad_norm": 1.5208653211593628, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.878044605255127, + "num_tokens": 599357802.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "grad_norm": 1.5985525846481323, + 
"learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8750433921813965, + "num_tokens": 599393825.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "grad_norm": 1.7152918577194214, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8784284591674805, + "num_tokens": 599424596.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "grad_norm": 1.5467805862426758, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8672037124633789, + "num_tokens": 599460534.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "grad_norm": 1.4657646417617798, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8902095556259155, + "num_tokens": 599495758.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "grad_norm": 1.4988203048706055, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.879982054233551, + "num_tokens": 599532103.0, + "step": 15716 + }, + { + "epoch": 1.9993639486070474, + "grad_norm": 1.4492818117141724, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8775940537452698, + "num_tokens": 599572200.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "grad_norm": 1.4922294616699219, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8785918951034546, + "num_tokens": 599611882.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "grad_norm": 1.3290767669677734, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.89484703540802, + "num_tokens": 599652369.0, + "step": 15719 + }, + { + "epoch": 1.999745579442819, + "grad_norm": 1.436208963394165, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8694871068000793, + "num_tokens": 599695407.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "grad_norm": 1.356650948524475, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8753056526184082, + "num_tokens": 599737531.0, + "step": 
15721 + }, + { + "epoch": 2.0, + "grad_norm": 1.4971895217895508, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8852971792221069, + "num_tokens": 599777940.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "grad_norm": 1.4226839542388916, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8943090438842773, + "num_tokens": 599814586.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "grad_norm": 1.3737350702285767, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.8993701338768005, + "num_tokens": 599853859.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "grad_norm": 1.5610830783843994, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8781592845916748, + "num_tokens": 599886515.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "grad_norm": 1.4406583309173584, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8815135955810547, + "num_tokens": 599925880.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "grad_norm": 1.4970542192459106, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8808409571647644, + "num_tokens": 599965551.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "grad_norm": 1.3884460926055908, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8814667463302612, + "num_tokens": 600007936.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "grad_norm": 1.6037449836730957, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8876569271087646, + "num_tokens": 600044867.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "grad_norm": 1.5470741987228394, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8732256889343262, + "num_tokens": 600082606.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "grad_norm": 1.5776978731155396, + "learning_rate": 1e-06, + "loss": 0.2713, + 
"mean_token_accuracy": 0.8995474576950073, + "num_tokens": 600115900.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "grad_norm": 1.6169946193695068, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8782704472541809, + "num_tokens": 600154559.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "grad_norm": 1.4953490495681763, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8733571171760559, + "num_tokens": 600200557.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "grad_norm": 1.5121936798095703, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8940211534500122, + "num_tokens": 600242416.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "grad_norm": 1.7045210599899292, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8857243061065674, + "num_tokens": 600276834.0, + "step": 15735 + }, + { + "epoch": 2.001780943900267, + "grad_norm": 1.6295207738876343, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8759682178497314, + "num_tokens": 600314209.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "grad_norm": 1.5881481170654297, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8865171074867249, + "num_tokens": 600352155.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "grad_norm": 1.547908902168274, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8846204280853271, + "num_tokens": 600387656.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + "grad_norm": 1.6041091680526733, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8890807032585144, + "num_tokens": 600422190.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "grad_norm": 1.53145170211792, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8791144490242004, + "num_tokens": 600459689.0, + "step": 15740 + }, + { + "epoch": 
2.0024169952932196, + "grad_norm": 1.6406171321868896, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8874262571334839, + "num_tokens": 600494434.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "grad_norm": 1.5006608963012695, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8905086517333984, + "num_tokens": 600536169.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "grad_norm": 1.4276386499404907, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8855279684066772, + "num_tokens": 600578085.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "grad_norm": 1.590656042098999, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8770517110824585, + "num_tokens": 600613954.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "grad_norm": 1.4799647331237793, + "learning_rate": 1e-06, + "loss": 0.2572, + "mean_token_accuracy": 0.9080588817596436, + "num_tokens": 600647728.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "grad_norm": 1.774918556213379, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8558679223060608, + "num_tokens": 600685224.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "grad_norm": 1.4065238237380981, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8939646482467651, + "num_tokens": 600724261.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "grad_norm": 1.7010571956634521, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8794809579849243, + "num_tokens": 600768004.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "grad_norm": 1.5034312009811401, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8974815607070923, + "num_tokens": 600803289.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "grad_norm": 1.4859726428985596, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 
0.8900498151779175, + "num_tokens": 600843845.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "grad_norm": 1.7132863998413086, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8773612380027771, + "num_tokens": 600875559.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "grad_norm": 1.5735348463058472, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8905426263809204, + "num_tokens": 600911028.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "grad_norm": 1.5793243646621704, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8905030488967896, + "num_tokens": 600948314.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "grad_norm": 1.5241050720214844, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.886360228061676, + "num_tokens": 600985958.0, + "step": 15754 + }, + { + "epoch": 2.004197939193487, + "grad_norm": 1.6475619077682495, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8842102885246277, + "num_tokens": 601020262.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "grad_norm": 1.514717936515808, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8960698246955872, + "num_tokens": 601063611.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "grad_norm": 1.617808222770691, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8858340978622437, + "num_tokens": 601096397.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "grad_norm": 1.391769289970398, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8916900157928467, + "num_tokens": 601137902.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "grad_norm": 1.3901697397232056, + "learning_rate": 1e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.8991903066635132, + "num_tokens": 601178563.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "grad_norm": 
1.6104434728622437, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8682318329811096, + "num_tokens": 601221387.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "grad_norm": 1.5869144201278687, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8931167125701904, + "num_tokens": 601255152.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "grad_norm": 1.63206946849823, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8950521945953369, + "num_tokens": 601286210.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "grad_norm": 1.5826674699783325, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8859329223632812, + "num_tokens": 601325647.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "grad_norm": 1.5299371480941772, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8915531039237976, + "num_tokens": 601362291.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "grad_norm": 1.4980695247650146, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8848879337310791, + "num_tokens": 601403717.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "grad_norm": 1.4309475421905518, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8821464776992798, + "num_tokens": 601446052.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "grad_norm": 1.7248015403747559, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8885923027992249, + "num_tokens": 601480380.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "grad_norm": 1.4910693168640137, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8877254724502563, + "num_tokens": 601519378.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "grad_norm": 1.4516361951828003, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8931937217712402, + 
"num_tokens": 601558891.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "grad_norm": 1.6931668519973755, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8821406364440918, + "num_tokens": 601589963.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "grad_norm": 1.5188839435577393, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8883849382400513, + "num_tokens": 601627497.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "grad_norm": 1.5741616487503052, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8974407315254211, + "num_tokens": 601662498.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "grad_norm": 1.5829473733901978, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8850759267807007, + "num_tokens": 601699524.0, + "step": 15773 + }, + { + "epoch": 2.0066149344867066, + "grad_norm": 1.4912290573120117, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8905491828918457, + "num_tokens": 601737893.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "grad_norm": 1.574880599975586, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8877887725830078, + "num_tokens": 601773660.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "grad_norm": 1.5664013624191284, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8953481912612915, + "num_tokens": 601813150.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "grad_norm": 1.5845580101013184, + "learning_rate": 1e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9064221382141113, + "num_tokens": 601848753.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "grad_norm": 1.6014056205749512, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8845397233963013, + "num_tokens": 601885879.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "grad_norm": 1.5131187438964844, + 
"learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8865664601325989, + "num_tokens": 601922999.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "grad_norm": 1.7997277975082397, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8704738020896912, + "num_tokens": 601958022.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "grad_norm": 1.6956690549850464, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8797770738601685, + "num_tokens": 601992950.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "grad_norm": 1.574600338935852, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8800569176673889, + "num_tokens": 602032871.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "grad_norm": 1.5612308979034424, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8874496221542358, + "num_tokens": 602069093.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "grad_norm": 1.624731421470642, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8801441192626953, + "num_tokens": 602102909.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "grad_norm": 1.5987316370010376, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8886526823043823, + "num_tokens": 602138119.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "grad_norm": 1.652737021446228, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8960142135620117, + "num_tokens": 602173885.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "grad_norm": 1.4456433057785034, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8896929025650024, + "num_tokens": 602219272.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "grad_norm": 1.5494288206100464, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8731495141983032, + "num_tokens": 602261158.0, + 
"step": 15788 + }, + { + "epoch": 2.008523088665564, + "grad_norm": 1.5341371297836304, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8932183980941772, + "num_tokens": 602297266.0, + "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "grad_norm": 1.6466825008392334, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8798063397407532, + "num_tokens": 602333759.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "grad_norm": 1.6684632301330566, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8742085099220276, + "num_tokens": 602369317.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "grad_norm": 1.6357777118682861, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8922070264816284, + "num_tokens": 602403590.0, + "step": 15792 + }, + { + "epoch": 2.009031929779926, + "grad_norm": 1.6224961280822754, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8807774782180786, + "num_tokens": 602439459.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "grad_norm": 1.5189865827560425, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.889224112033844, + "num_tokens": 602479151.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "grad_norm": 1.5054256916046143, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8947979807853699, + "num_tokens": 602517029.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "grad_norm": 1.4627467393875122, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8892834186553955, + "num_tokens": 602558000.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "grad_norm": 1.4875268936157227, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8885836601257324, + "num_tokens": 602599411.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "grad_norm": 1.3609751462936401, + "learning_rate": 1e-06, + "loss": 
0.3211, + "mean_token_accuracy": 0.882964015007019, + "num_tokens": 602646188.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "grad_norm": 1.4425675868988037, + "learning_rate": 1e-06, + "loss": 0.2578, + "mean_token_accuracy": 0.9036002159118652, + "num_tokens": 602683739.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "grad_norm": 1.683821678161621, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8890998959541321, + "num_tokens": 602716665.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "grad_norm": 1.6284675598144531, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8814882040023804, + "num_tokens": 602752527.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "grad_norm": 1.5603692531585693, + "learning_rate": 1e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.8995151519775391, + "num_tokens": 602784134.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "grad_norm": 1.4468894004821777, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.8983863592147827, + "num_tokens": 602823563.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "grad_norm": 1.4469678401947021, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8784599900245667, + "num_tokens": 602867202.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "grad_norm": 1.5285980701446533, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8769651055335999, + "num_tokens": 602903990.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "grad_norm": 1.5240614414215088, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8825458884239197, + "num_tokens": 602942421.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "grad_norm": 1.5542112588882446, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8933709859848022, + "num_tokens": 602977797.0, + "step": 15807 + }, + { + "epoch": 
2.0109400839587837, + "grad_norm": 1.5211341381072998, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8952385783195496, + "num_tokens": 603015188.0, + "step": 15808 + }, + { + "epoch": 2.011067294237374, + "grad_norm": 1.531848430633545, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8786080479621887, + "num_tokens": 603055113.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "grad_norm": 1.602105736732483, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8897479176521301, + "num_tokens": 603088829.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "grad_norm": 1.5073710680007935, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8871371150016785, + "num_tokens": 603127443.0, + "step": 15811 + }, + { + "epoch": 2.0114489250731458, + "grad_norm": 1.5918716192245483, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8750307559967041, + "num_tokens": 603166279.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "grad_norm": 1.463331937789917, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8911328911781311, + "num_tokens": 603210769.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "grad_norm": 1.477583408355713, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8906010389328003, + "num_tokens": 603249826.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "grad_norm": 1.5641112327575684, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8822070360183716, + "num_tokens": 603287722.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "grad_norm": 1.5061792135238647, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8836703300476074, + "num_tokens": 603325832.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "grad_norm": 1.4849220514297485, + "learning_rate": 1e-06, + "loss": 0.2776, + "mean_token_accuracy": 
0.8977190256118774, + "num_tokens": 603362905.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "grad_norm": 1.5822863578796387, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8755651712417603, + "num_tokens": 603399514.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "grad_norm": 1.421960711479187, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8963382840156555, + "num_tokens": 603440645.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "grad_norm": 1.5415241718292236, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.880861759185791, + "num_tokens": 603480055.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "grad_norm": 1.3725221157073975, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8896197080612183, + "num_tokens": 603526792.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "grad_norm": 1.3599915504455566, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8999543190002441, + "num_tokens": 603567844.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "grad_norm": 1.4276398420333862, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8822600841522217, + "num_tokens": 603610549.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "grad_norm": 1.4784345626831055, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8852444887161255, + "num_tokens": 603651652.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "grad_norm": 1.516679286956787, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8953676223754883, + "num_tokens": 603688671.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "grad_norm": 1.6142339706420898, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8846814632415771, + "num_tokens": 603725009.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "grad_norm": 
1.6832300424575806, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8726459741592407, + "num_tokens": 603761938.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + "grad_norm": 1.4549100399017334, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8875678181648254, + "num_tokens": 603803861.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "grad_norm": 1.6348559856414795, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.876335620880127, + "num_tokens": 603842109.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "grad_norm": 1.575401782989502, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.8981152176856995, + "num_tokens": 603875136.0, + "step": 15830 + }, + { + "epoch": 2.013865920366366, + "grad_norm": 1.5164164304733276, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8704811334609985, + "num_tokens": 603918250.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "grad_norm": 1.575191855430603, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8702765107154846, + "num_tokens": 603958564.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "grad_norm": 1.4808359146118164, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.8997020721435547, + "num_tokens": 603996541.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "grad_norm": 1.4679018259048462, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8787535429000854, + "num_tokens": 604038281.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "grad_norm": 1.6042367219924927, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8852860927581787, + "num_tokens": 604075360.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "grad_norm": 1.7510268688201904, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8837781548500061, + "num_tokens": 
604108151.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "grad_norm": 1.6100982427597046, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.874016284942627, + "num_tokens": 604149776.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "grad_norm": 1.5235207080841064, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8968170881271362, + "num_tokens": 604187417.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "grad_norm": 1.606062412261963, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8826674222946167, + "num_tokens": 604225462.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "grad_norm": 1.5717726945877075, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8871492147445679, + "num_tokens": 604263154.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "grad_norm": 1.4078830480575562, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8840617537498474, + "num_tokens": 604307594.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "grad_norm": 1.549543023109436, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8860448598861694, + "num_tokens": 604342106.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "grad_norm": 1.5889590978622437, + "learning_rate": 1e-06, + "loss": 0.2656, + "mean_token_accuracy": 0.9031294584274292, + "num_tokens": 604376231.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "grad_norm": 1.6492815017700195, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8887951374053955, + "num_tokens": 604408457.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "grad_norm": 1.5511852502822876, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8738675117492676, + "num_tokens": 604448758.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "grad_norm": 1.5441524982452393, + "learning_rate": 
1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8885916471481323, + "num_tokens": 604486755.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "grad_norm": 1.7456097602844238, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8882966041564941, + "num_tokens": 604515792.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "grad_norm": 1.5764014720916748, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8945102691650391, + "num_tokens": 604553058.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "grad_norm": 1.5163402557373047, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.884375274181366, + "num_tokens": 604592327.0, + "step": 15849 + }, + { + "epoch": 2.0162829156595854, + "grad_norm": 1.6717314720153809, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8873893022537231, + "num_tokens": 604624385.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "grad_norm": 1.4849334955215454, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8854167461395264, + "num_tokens": 604665620.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "grad_norm": 1.5685410499572754, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8779237270355225, + "num_tokens": 604704073.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "grad_norm": 1.4578442573547363, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8761136531829834, + "num_tokens": 604748696.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "grad_norm": 1.6610791683197021, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8834323883056641, + "num_tokens": 604781994.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "grad_norm": 1.6954777240753174, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8736044764518738, + "num_tokens": 604819520.0, + "step": 15855 + }, + { + 
"epoch": 2.0170461773311286, + "grad_norm": 1.7265980243682861, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8817662000656128, + "num_tokens": 604854846.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "grad_norm": 1.577173113822937, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8722481727600098, + "num_tokens": 604893989.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "grad_norm": 1.5405433177947998, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8860309720039368, + "num_tokens": 604932006.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "grad_norm": 1.4902437925338745, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8912363052368164, + "num_tokens": 604972015.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "grad_norm": 1.4709850549697876, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.890903890132904, + "num_tokens": 605012127.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "grad_norm": 1.4034687280654907, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8930049538612366, + "num_tokens": 605052896.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "grad_norm": 1.3986026048660278, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.894629955291748, + "num_tokens": 605095881.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "grad_norm": 1.5841761827468872, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8993203043937683, + "num_tokens": 605126714.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "grad_norm": 1.5889023542404175, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8962207436561584, + "num_tokens": 605161819.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "grad_norm": 1.550325632095337, + "learning_rate": 1e-06, + "loss": 0.3769, + 
"mean_token_accuracy": 0.8662999272346497, + "num_tokens": 605202611.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "grad_norm": 1.5516574382781982, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8893070816993713, + "num_tokens": 605238316.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "grad_norm": 1.5852614641189575, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8917484879493713, + "num_tokens": 605274724.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "grad_norm": 1.5531331300735474, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8745001554489136, + "num_tokens": 605312969.0, + "step": 15868 + }, + { + "epoch": 2.018699910952805, + "grad_norm": 1.7746273279190063, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8817865252494812, + "num_tokens": 605345481.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "grad_norm": 1.5562589168548584, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8844057321548462, + "num_tokens": 605382767.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "grad_norm": 1.547760248184204, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8855104446411133, + "num_tokens": 605424451.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "grad_norm": 1.6755483150482178, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8939386606216431, + "num_tokens": 605455243.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "grad_norm": 1.6596088409423828, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8943349123001099, + "num_tokens": 605489326.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "grad_norm": 1.5965189933776855, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8824014663696289, + "num_tokens": 605526836.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, 
+ "grad_norm": 1.6869380474090576, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8738358020782471, + "num_tokens": 605564924.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "grad_norm": 1.6228444576263428, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8947359323501587, + "num_tokens": 605599345.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "grad_norm": 1.4676884412765503, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.8986101150512695, + "num_tokens": 605636981.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "grad_norm": 1.5195435285568237, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8906245827674866, + "num_tokens": 605672740.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "grad_norm": 1.4796156883239746, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.896033763885498, + "num_tokens": 605710525.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "grad_norm": 1.5212671756744385, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8947663307189941, + "num_tokens": 605748305.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "grad_norm": 1.4531807899475098, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.9006599187850952, + "num_tokens": 605784604.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "grad_norm": 1.346958875656128, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8904839754104614, + "num_tokens": 605830908.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "grad_norm": 1.5538694858551025, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8839808106422424, + "num_tokens": 605869634.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "grad_norm": 1.6092005968093872, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8814601898193359, 
+ "num_tokens": 605906061.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "grad_norm": 1.5647746324539185, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8930011987686157, + "num_tokens": 605942494.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "grad_norm": 1.5885335206985474, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8885329961776733, + "num_tokens": 605977442.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "grad_norm": 1.592564582824707, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8856599926948547, + "num_tokens": 606014018.0, + "step": 15887 + }, + { + "epoch": 2.0211169062460246, + "grad_norm": 1.580288052558899, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8861817121505737, + "num_tokens": 606050829.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "grad_norm": 1.4956032037734985, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8827588558197021, + "num_tokens": 606095464.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "grad_norm": 1.555177092552185, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8888624310493469, + "num_tokens": 606134091.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "grad_norm": 1.6262246370315552, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8860610723495483, + "num_tokens": 606169395.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "grad_norm": 1.6386947631835938, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.890656590461731, + "num_tokens": 606204876.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "grad_norm": 1.5321331024169922, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8909873962402344, + "num_tokens": 606241547.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "grad_norm": 1.565138816833496, + 
"learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8875609040260315, + "num_tokens": 606281488.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "grad_norm": 1.5384501218795776, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8877441883087158, + "num_tokens": 606318036.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "grad_norm": 1.6049498319625854, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8851965069770813, + "num_tokens": 606356936.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "grad_norm": 1.6790741682052612, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.884670615196228, + "num_tokens": 606391745.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "grad_norm": 1.5929571390151978, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8882507681846619, + "num_tokens": 606427699.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "grad_norm": 1.5760691165924072, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8886342644691467, + "num_tokens": 606464095.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "grad_norm": 1.536616563796997, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8802880644798279, + "num_tokens": 606505002.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "grad_norm": 1.5715038776397705, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8919467926025391, + "num_tokens": 606542283.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "grad_norm": 1.6350517272949219, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8848408460617065, + "num_tokens": 606577101.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "grad_norm": 1.5430517196655273, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8927823305130005, + "num_tokens": 606615188.0, + "step": 
15903 + }, + { + "epoch": 2.023152270703473, + "grad_norm": 1.5246416330337524, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.8984545469284058, + "num_tokens": 606652857.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "grad_norm": 1.640010118484497, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8748917579650879, + "num_tokens": 606698333.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "grad_norm": 1.4194855690002441, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8763591647148132, + "num_tokens": 606744773.0, + "step": 15906 + }, + { + "epoch": 2.023533901539244, + "grad_norm": 1.7051012516021729, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8785613775253296, + "num_tokens": 606778618.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "grad_norm": 1.5841137170791626, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8880312442779541, + "num_tokens": 606814486.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "grad_norm": 1.759681224822998, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8790534734725952, + "num_tokens": 606854608.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "grad_norm": 1.5518889427185059, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8832496404647827, + "num_tokens": 606893240.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "grad_norm": 1.5591340065002441, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8748652935028076, + "num_tokens": 606935185.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "grad_norm": 1.3837460279464722, + "learning_rate": 1e-06, + "loss": 0.2559, + "mean_token_accuracy": 0.9072152376174927, + "num_tokens": 606976327.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "grad_norm": 1.570304274559021, + "learning_rate": 1e-06, + "loss": 0.3203, + 
"mean_token_accuracy": 0.8862292766571045, + "num_tokens": 607011459.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "grad_norm": 1.5185129642486572, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.896125078201294, + "num_tokens": 607049630.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "grad_norm": 1.5066832304000854, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8878880739212036, + "num_tokens": 607088763.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "grad_norm": 1.6625534296035767, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.893193781375885, + "num_tokens": 607127652.0, + "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "grad_norm": 1.5174039602279663, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.883776068687439, + "num_tokens": 607164552.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "grad_norm": 1.5323352813720703, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8864482641220093, + "num_tokens": 607206607.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "grad_norm": 1.5452066659927368, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8983578681945801, + "num_tokens": 607240931.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "grad_norm": 1.5611827373504639, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8906046152114868, + "num_tokens": 607277824.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "grad_norm": 1.5197436809539795, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.868579089641571, + "num_tokens": 607319933.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "grad_norm": 1.639374852180481, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8905807733535767, + "num_tokens": 607354480.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + 
"grad_norm": 1.4512256383895874, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8884446620941162, + "num_tokens": 607395435.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "grad_norm": 1.5411392450332642, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.871076226234436, + "num_tokens": 607434479.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "grad_norm": 1.5198776721954346, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8711985945701599, + "num_tokens": 607476078.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "grad_norm": 1.5564862489700317, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8710170984268188, + "num_tokens": 607514821.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "grad_norm": 1.588044285774231, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8784730434417725, + "num_tokens": 607551408.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "grad_norm": 1.3699158430099487, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8914397954940796, + "num_tokens": 607593256.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "grad_norm": 1.5375117063522339, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.876770555973053, + "num_tokens": 607636271.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "grad_norm": 1.5835472345352173, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8751660585403442, + "num_tokens": 607673639.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "grad_norm": 1.4753518104553223, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8728769421577454, + "num_tokens": 607715104.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "grad_norm": 1.424548625946045, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8850380182266235, + 
"num_tokens": 607759478.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "grad_norm": 1.5836313962936401, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8928077816963196, + "num_tokens": 607801427.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "grad_norm": 1.5626373291015625, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8946177959442139, + "num_tokens": 607837981.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "grad_norm": 1.620758056640625, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8876234292984009, + "num_tokens": 607872202.0, + "step": 15935 + }, + { + "epoch": 2.027222999618369, + "grad_norm": 1.535805583000183, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8720134496688843, + "num_tokens": 607912832.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "grad_norm": 1.4010980129241943, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8785222172737122, + "num_tokens": 607956650.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "grad_norm": 1.4731396436691284, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.889074444770813, + "num_tokens": 607996319.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "grad_norm": 1.5571168661117554, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8989130854606628, + "num_tokens": 608033019.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "grad_norm": 1.4874080419540405, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8898975253105164, + "num_tokens": 608073546.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "grad_norm": 1.745863914489746, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8714843988418579, + "num_tokens": 608113565.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "grad_norm": 1.5132490396499634, + 
"learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8917757272720337, + "num_tokens": 608152229.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "grad_norm": 1.4757094383239746, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8941025733947754, + "num_tokens": 608190538.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "grad_norm": 1.6828306913375854, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8836929798126221, + "num_tokens": 608224630.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "grad_norm": 1.4729727506637573, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8958935737609863, + "num_tokens": 608264604.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "grad_norm": 1.5792322158813477, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8867931365966797, + "num_tokens": 608304528.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "grad_norm": 1.4729217290878296, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8868850469589233, + "num_tokens": 608344815.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "grad_norm": 1.4894322156906128, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8844338655471802, + "num_tokens": 608388385.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "grad_norm": 1.5319106578826904, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8744450807571411, + "num_tokens": 608427548.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "grad_norm": 1.4300605058670044, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.881916880607605, + "num_tokens": 608471385.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "grad_norm": 1.6318230628967285, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8734195232391357, + "num_tokens": 608509599.0, + "step": 
15951 + }, + { + "epoch": 2.0292583640758175, + "grad_norm": 1.5925978422164917, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.888350248336792, + "num_tokens": 608546685.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "grad_norm": 1.4860025644302368, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8829805850982666, + "num_tokens": 608589141.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "grad_norm": 1.5693789720535278, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8894444704055786, + "num_tokens": 608626671.0, + "step": 15954 + }, + { + "epoch": 2.0296399949115886, + "grad_norm": 1.455507755279541, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8713183403015137, + "num_tokens": 608670695.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "grad_norm": 1.6788891553878784, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8872830271720886, + "num_tokens": 608703424.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "grad_norm": 1.4888923168182373, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8923668265342712, + "num_tokens": 608741652.0, + "step": 15957 + }, + { + "epoch": 2.0300216257473602, + "grad_norm": 1.5713653564453125, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8821292519569397, + "num_tokens": 608780990.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "grad_norm": 1.527607798576355, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8852322697639465, + "num_tokens": 608822604.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "grad_norm": 1.4127205610275269, + "learning_rate": 1e-06, + "loss": 0.2492, + "mean_token_accuracy": 0.9070867896080017, + "num_tokens": 608861846.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "grad_norm": 1.5368261337280273, + "learning_rate": 1e-06, + "loss": 0.3451, 
+ "mean_token_accuracy": 0.8751806020736694, + "num_tokens": 608902731.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "grad_norm": 1.5084080696105957, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8892199397087097, + "num_tokens": 608941147.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "grad_norm": 1.5409648418426514, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8893334865570068, + "num_tokens": 608984058.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "grad_norm": 1.5686825513839722, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.891355037689209, + "num_tokens": 609022947.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "grad_norm": 1.7175886631011963, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8814965486526489, + "num_tokens": 609061324.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "grad_norm": 1.4700149297714233, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8932355642318726, + "num_tokens": 609100664.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "grad_norm": 1.421566128730774, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8928878903388977, + "num_tokens": 609144429.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "grad_norm": 1.5687696933746338, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8769206404685974, + "num_tokens": 609184559.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "grad_norm": 1.670698642730713, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8850093483924866, + "num_tokens": 609216970.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "grad_norm": 1.386863350868225, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8938040137290955, + "num_tokens": 609259148.0, + "step": 15970 + }, + { + "epoch": 
2.031675359369037, + "grad_norm": 1.4643816947937012, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8918002843856812, + "num_tokens": 609296589.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "grad_norm": 1.466942310333252, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8927097916603088, + "num_tokens": 609337787.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "grad_norm": 1.6638330221176147, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8783707618713379, + "num_tokens": 609373192.0, + "step": 15973 + }, + { + "epoch": 2.0320569902048087, + "grad_norm": 1.5799890756607056, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8779271245002747, + "num_tokens": 609410438.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "grad_norm": 1.544736385345459, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8851044178009033, + "num_tokens": 609448686.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "grad_norm": 1.6017920970916748, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8931275606155396, + "num_tokens": 609482130.0, + "step": 15976 + }, + { + "epoch": 2.0324386210405803, + "grad_norm": 1.6970269680023193, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8891119360923767, + "num_tokens": 609513044.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "grad_norm": 1.4339005947113037, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8814862966537476, + "num_tokens": 609553112.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "grad_norm": 1.6443780660629272, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8630547523498535, + "num_tokens": 609592039.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "grad_norm": 1.619828462600708, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 
0.8847178220748901, + "num_tokens": 609626848.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "grad_norm": 1.5940107107162476, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8859592080116272, + "num_tokens": 609661626.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "grad_norm": 1.467305302619934, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8722887635231018, + "num_tokens": 609706434.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "grad_norm": 1.6889435052871704, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8868677616119385, + "num_tokens": 609741244.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "grad_norm": 1.630086898803711, + "learning_rate": 1e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.8970421552658081, + "num_tokens": 609773798.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "grad_norm": 1.4171881675720215, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8851436376571655, + "num_tokens": 609818535.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "grad_norm": 1.3598366975784302, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9009982943534851, + "num_tokens": 609861366.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "grad_norm": 1.5122374296188354, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8720784187316895, + "num_tokens": 609904392.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "grad_norm": 1.6117157936096191, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8801532983779907, + "num_tokens": 609941579.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "grad_norm": 1.5007822513580322, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8878796696662903, + "num_tokens": 609982183.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "grad_norm": 
1.4099055528640747, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8897338509559631, + "num_tokens": 610025790.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "grad_norm": 1.5047903060913086, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8912066221237183, + "num_tokens": 610069220.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "grad_norm": 1.8017873764038086, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8718885183334351, + "num_tokens": 610101730.0, + "step": 15992 + }, + { + "epoch": 2.0344739854980283, + "grad_norm": 1.50482177734375, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8720574975013733, + "num_tokens": 610144013.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "grad_norm": 1.558243989944458, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8761098384857178, + "num_tokens": 610183880.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "grad_norm": 1.5704975128173828, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8777443170547485, + "num_tokens": 610222087.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "grad_norm": 1.61928391456604, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.876039981842041, + "num_tokens": 610257983.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "grad_norm": 1.3895199298858643, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8925113081932068, + "num_tokens": 610301401.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "grad_norm": 1.4359469413757324, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8882867097854614, + "num_tokens": 610343352.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "grad_norm": 1.671750545501709, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8944246768951416, + "num_tokens": 
610374699.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "grad_norm": 1.5236419439315796, + "learning_rate": 1e-06, + "loss": 0.2595, + "mean_token_accuracy": 0.9046504497528076, + "num_tokens": 610407591.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "grad_norm": 1.4838899374008179, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8935883045196533, + "num_tokens": 610446017.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "grad_norm": 1.5978842973709106, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8817064762115479, + "num_tokens": 610483867.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "grad_norm": 2.1386160850524902, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.900162935256958, + "num_tokens": 610516573.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "grad_norm": 1.6862927675247192, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8874236345291138, + "num_tokens": 610552061.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "grad_norm": 1.6617703437805176, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8779609203338623, + "num_tokens": 610585675.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "grad_norm": 1.6170294284820557, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8907102346420288, + "num_tokens": 610617750.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "grad_norm": 1.649165391921997, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8860365152359009, + "num_tokens": 610652083.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "grad_norm": 1.7128514051437378, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8774731755256653, + "num_tokens": 610685625.0, + "step": 16008 + }, + { + "epoch": 2.0365093499554763, + "grad_norm": 1.5691267251968384, + "learning_rate": 
1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8864058256149292, + "num_tokens": 610727368.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "grad_norm": 1.5808570384979248, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8916423320770264, + "num_tokens": 610763713.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "grad_norm": 1.4455684423446655, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8936845660209656, + "num_tokens": 610805556.0, + "step": 16011 + }, + { + "epoch": 2.036890980791248, + "grad_norm": 1.4267786741256714, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8916773200035095, + "num_tokens": 610850143.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "grad_norm": 1.5072234869003296, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8825039863586426, + "num_tokens": 610890634.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "grad_norm": 1.5879918336868286, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8856460452079773, + "num_tokens": 610926688.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "grad_norm": 1.629556655883789, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8746256232261658, + "num_tokens": 610962182.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "grad_norm": 1.4880205392837524, + "learning_rate": 1e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.8999961614608765, + "num_tokens": 611000718.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "grad_norm": 1.4303725957870483, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8935943841934204, + "num_tokens": 611041816.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "grad_norm": 1.6560707092285156, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8819288015365601, + "num_tokens": 611075945.0, + "step": 16018 + }, + { + 
"epoch": 2.0377814527413816, + "grad_norm": 1.5485179424285889, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8840871453285217, + "num_tokens": 611112077.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "grad_norm": 1.5334004163742065, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8851585984230042, + "num_tokens": 611150099.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "grad_norm": 1.5318002700805664, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8813095092773438, + "num_tokens": 611190559.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "grad_norm": 1.6030235290527344, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8809086084365845, + "num_tokens": 611227136.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "grad_norm": 1.4796448945999146, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8958530426025391, + "num_tokens": 611266559.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "grad_norm": 1.4424768686294556, + "learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9038712978363037, + "num_tokens": 611306618.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "grad_norm": 1.4715628623962402, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8891436457633972, + "num_tokens": 611347587.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "grad_norm": 1.5397555828094482, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.8996680974960327, + "num_tokens": 611382640.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "grad_norm": 1.4823662042617798, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8892289400100708, + "num_tokens": 611425113.0, + "step": 16027 + }, + { + "epoch": 2.0389263452486963, + "grad_norm": 1.7328684329986572, + "learning_rate": 1e-06, + "loss": 0.3354, + 
"mean_token_accuracy": 0.8809397220611572, + "num_tokens": 611458391.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "grad_norm": 1.7057900428771973, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.862619936466217, + "num_tokens": 611496599.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "grad_norm": 1.509139060974121, + "learning_rate": 1e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.8983659744262695, + "num_tokens": 611533015.0, + "step": 16030 + }, + { + "epoch": 2.0393079760844675, + "grad_norm": 1.5777570009231567, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8906129598617554, + "num_tokens": 611570041.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "grad_norm": 1.4645626544952393, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8750962018966675, + "num_tokens": 611613598.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "grad_norm": 1.6777677536010742, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8935174942016602, + "num_tokens": 611641705.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "grad_norm": 1.5263925790786743, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8969144821166992, + "num_tokens": 611677856.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "grad_norm": 1.6035799980163574, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8975379467010498, + "num_tokens": 611711479.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "grad_norm": 1.5257322788238525, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8787838220596313, + "num_tokens": 611751500.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "grad_norm": 1.5188274383544922, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8738763332366943, + "num_tokens": 611794887.0, + "step": 16037 + }, + { + "epoch": 
2.040198448034601, + "grad_norm": 1.5293023586273193, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8886829018592834, + "num_tokens": 611832496.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "grad_norm": 1.6239792108535767, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8722752928733826, + "num_tokens": 611875123.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "grad_norm": 1.4003795385360718, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8919160962104797, + "num_tokens": 611915402.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "grad_norm": 1.596142292022705, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8912482857704163, + "num_tokens": 611948766.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "grad_norm": 1.6681256294250488, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8923062682151794, + "num_tokens": 611981947.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "grad_norm": 1.3781614303588867, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8962670564651489, + "num_tokens": 612024311.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "grad_norm": 1.404642105102539, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8960084319114685, + "num_tokens": 612066257.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "grad_norm": 1.5417383909225464, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8855529427528381, + "num_tokens": 612104242.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "grad_norm": 1.564024567604065, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8875731229782104, + "num_tokens": 612141238.0, + "step": 16046 + }, + { + "epoch": 2.041343340541916, + "grad_norm": 1.5036332607269287, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 
0.8891960978507996, + "num_tokens": 612177721.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "grad_norm": 1.717463731765747, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8868021368980408, + "num_tokens": 612212224.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "grad_norm": 1.4984540939331055, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8856917023658752, + "num_tokens": 612252833.0, + "step": 16049 + }, + { + "epoch": 2.0417249713776875, + "grad_norm": 1.5023664236068726, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8976467251777649, + "num_tokens": 612294871.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "grad_norm": 1.4231609106063843, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.8996492028236389, + "num_tokens": 612333326.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "grad_norm": 1.530004858970642, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8907705545425415, + "num_tokens": 612373729.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "grad_norm": 1.5947495698928833, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8867951035499573, + "num_tokens": 612408924.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "grad_norm": 1.4733467102050781, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8882892727851868, + "num_tokens": 612449569.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "grad_norm": 1.4305146932601929, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8969308733940125, + "num_tokens": 612489677.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "grad_norm": 1.5177818536758423, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.8997933268547058, + "num_tokens": 612523746.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "grad_norm": 
1.5399422645568848, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.884067177772522, + "num_tokens": 612562172.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "grad_norm": 1.8386039733886719, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8875494599342346, + "num_tokens": 612605556.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "grad_norm": 1.5585392713546753, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8849877119064331, + "num_tokens": 612643645.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "grad_norm": 1.5199910402297974, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8967717885971069, + "num_tokens": 612679360.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "grad_norm": 1.6895503997802734, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8813899755477905, + "num_tokens": 612715770.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "grad_norm": 1.7390180826187134, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8812516927719116, + "num_tokens": 612752220.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "grad_norm": 1.516365647315979, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8926982879638672, + "num_tokens": 612790360.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "grad_norm": 1.5258196592330933, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8903219699859619, + "num_tokens": 612827166.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "grad_norm": 1.6668821573257446, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8774995803833008, + "num_tokens": 612862192.0, + "step": 16065 + }, + { + "epoch": 2.0437603358351355, + "grad_norm": 1.5544278621673584, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8953315019607544, + "num_tokens": 
612898591.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "grad_norm": 1.8013567924499512, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8754447102546692, + "num_tokens": 612929025.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "grad_norm": 1.8195918798446655, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8790484666824341, + "num_tokens": 612958077.0, + "step": 16068 + }, + { + "epoch": 2.044141966670907, + "grad_norm": 1.6022650003433228, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.881924569606781, + "num_tokens": 612995230.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "grad_norm": 1.6187551021575928, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8876248598098755, + "num_tokens": 613028359.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "grad_norm": 1.5676716566085815, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8708260655403137, + "num_tokens": 613069624.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "grad_norm": 1.6362003087997437, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8808761835098267, + "num_tokens": 613106502.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "grad_norm": 1.5991171598434448, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8897953629493713, + "num_tokens": 613142370.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "grad_norm": 1.4025287628173828, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8993628621101379, + "num_tokens": 613182255.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "grad_norm": 1.5135526657104492, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8975036144256592, + "num_tokens": 613221153.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "grad_norm": 1.528011679649353, + "learning_rate": 
1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8829519152641296, + "num_tokens": 613264952.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "grad_norm": 1.5080899000167847, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8690351247787476, + "num_tokens": 613311309.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "grad_norm": 1.523298740386963, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.899089515209198, + "num_tokens": 613345571.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "grad_norm": 1.7083098888397217, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8819728493690491, + "num_tokens": 613380821.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "grad_norm": 1.4932606220245361, + "learning_rate": 1e-06, + "loss": 0.278, + "mean_token_accuracy": 0.8996281027793884, + "num_tokens": 613421097.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "grad_norm": 1.6737357378005981, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8785671591758728, + "num_tokens": 613459182.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "grad_norm": 1.5643094778060913, + "learning_rate": 1e-06, + "loss": 0.2509, + "mean_token_accuracy": 0.9077056646347046, + "num_tokens": 613490960.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "grad_norm": 1.503290057182312, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8852095603942871, + "num_tokens": 613530494.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "grad_norm": 1.443497657775879, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8850356340408325, + "num_tokens": 613575814.0, + "step": 16084 + }, + { + "epoch": 2.046177331128355, + "grad_norm": 1.518660545349121, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8877056837081909, + "num_tokens": 613615729.0, + "step": 16085 + }, + { + 
"epoch": 2.0463045414069456, + "grad_norm": 1.5802284479141235, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.887169599533081, + "num_tokens": 613656249.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "grad_norm": 1.6025761365890503, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8821794986724854, + "num_tokens": 613696570.0, + "step": 16087 + }, + { + "epoch": 2.0465589619641267, + "grad_norm": 1.5664006471633911, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8810539245605469, + "num_tokens": 613737873.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "grad_norm": 1.7562892436981201, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8823789358139038, + "num_tokens": 613768247.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "grad_norm": 1.5921263694763184, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8962198495864868, + "num_tokens": 613802287.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "grad_norm": 1.6036694049835205, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.892069399356842, + "num_tokens": 613840906.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "grad_norm": 1.7414720058441162, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8746268153190613, + "num_tokens": 613877226.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "grad_norm": 1.362697720527649, + "learning_rate": 1e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.8999639749526978, + "num_tokens": 613921080.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "grad_norm": 1.5584096908569336, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8893746137619019, + "num_tokens": 613954649.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "grad_norm": 1.4983762502670288, + "learning_rate": 1e-06, + "loss": 0.3294, + 
"mean_token_accuracy": 0.8863037824630737, + "num_tokens": 613993393.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "grad_norm": 1.521802544593811, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8834497928619385, + "num_tokens": 614034365.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "grad_norm": 1.5359312295913696, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8857961297035217, + "num_tokens": 614078040.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "grad_norm": 1.4254424571990967, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.899252712726593, + "num_tokens": 614117412.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "grad_norm": 1.7353636026382446, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8674910068511963, + "num_tokens": 614153913.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "grad_norm": 1.5600286722183228, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8939723372459412, + "num_tokens": 614189834.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "grad_norm": 1.4988895654678345, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8876692056655884, + "num_tokens": 614229545.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "grad_norm": 1.4591516256332397, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8906269073486328, + "num_tokens": 614270304.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "grad_norm": 1.5061017274856567, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8957392573356628, + "num_tokens": 614309356.0, + "step": 16103 + }, + { + "epoch": 2.0485943264215747, + "grad_norm": 1.513661503791809, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8816516995429993, + "num_tokens": 614353644.0, + "step": 16104 + }, + { + "epoch": 
2.048721536700165, + "grad_norm": 1.5673476457595825, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8720142841339111, + "num_tokens": 614392767.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "grad_norm": 1.4468306303024292, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8930237293243408, + "num_tokens": 614433873.0, + "step": 16106 + }, + { + "epoch": 2.0489759572573463, + "grad_norm": 1.5290696620941162, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8888666033744812, + "num_tokens": 614471624.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "grad_norm": 1.4223737716674805, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8827098608016968, + "num_tokens": 614515161.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "grad_norm": 1.457108974456787, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8931100368499756, + "num_tokens": 614555259.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "grad_norm": 1.4998935461044312, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.881685733795166, + "num_tokens": 614598407.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "grad_norm": 1.5592906475067139, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8861327171325684, + "num_tokens": 614635289.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "grad_norm": 1.4789154529571533, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8827751874923706, + "num_tokens": 614675496.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "grad_norm": 1.5780194997787476, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8867020010948181, + "num_tokens": 614714904.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "grad_norm": 1.4581385850906372, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 
0.8962466716766357, + "num_tokens": 614754977.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "grad_norm": 1.6087497472763062, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8742644190788269, + "num_tokens": 614792801.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "grad_norm": 1.6704659461975098, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8893967866897583, + "num_tokens": 614825274.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "grad_norm": 1.5656241178512573, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8931372165679932, + "num_tokens": 614865225.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "grad_norm": 1.670021414756775, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8641696572303772, + "num_tokens": 614904783.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "grad_norm": 1.5878989696502686, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8762539625167847, + "num_tokens": 614942282.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "grad_norm": 1.5036742687225342, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8863146305084229, + "num_tokens": 614981375.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "grad_norm": 1.5997499227523804, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8752175569534302, + "num_tokens": 615022873.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "grad_norm": 1.5160869359970093, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8816277384757996, + "num_tokens": 615064148.0, + "step": 16122 + }, + { + "epoch": 2.0510113217147947, + "grad_norm": 1.4928679466247559, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8899581432342529, + "num_tokens": 615101881.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "grad_norm": 
1.6215145587921143, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8897395133972168, + "num_tokens": 615133414.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "grad_norm": 1.5146358013153076, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8939014077186584, + "num_tokens": 615169437.0, + "step": 16125 + }, + { + "epoch": 2.0513929525505663, + "grad_norm": 1.5358561277389526, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8794578909873962, + "num_tokens": 615207736.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "grad_norm": 1.5915380716323853, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8842455148696899, + "num_tokens": 615246240.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "grad_norm": 1.5541260242462158, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8892680406570435, + "num_tokens": 615284063.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "grad_norm": 1.4781407117843628, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8763489127159119, + "num_tokens": 615328876.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "grad_norm": 1.603502869606018, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8909244537353516, + "num_tokens": 615362178.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "grad_norm": 1.6293683052062988, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8807390928268433, + "num_tokens": 615400826.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "grad_norm": 1.6792532205581665, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8791422843933105, + "num_tokens": 615436056.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "grad_norm": 1.5990989208221436, + "learning_rate": 1e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9022340178489685, + "num_tokens": 
615473271.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "grad_norm": 1.5957351922988892, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8784939050674438, + "num_tokens": 615509974.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "grad_norm": 1.5181981325149536, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8744547963142395, + "num_tokens": 615550321.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "grad_norm": 1.6385194063186646, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8894898891448975, + "num_tokens": 615588447.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "grad_norm": 1.545667290687561, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8990694284439087, + "num_tokens": 615624124.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "grad_norm": 1.538834571838379, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8884378671646118, + "num_tokens": 615665145.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "grad_norm": 1.4475505352020264, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8847192525863647, + "num_tokens": 615705697.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "grad_norm": 1.5163934230804443, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8921771049499512, + "num_tokens": 615742410.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "grad_norm": 1.597609519958496, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8803832530975342, + "num_tokens": 615778670.0, + "step": 16141 + }, + { + "epoch": 2.0534283170080143, + "grad_norm": 1.4743728637695312, + "learning_rate": 1e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.902907133102417, + "num_tokens": 615820005.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "grad_norm": 1.5143601894378662, + "learning_rate": 1e-06, 
+ "loss": 0.3215, + "mean_token_accuracy": 0.8862379789352417, + "num_tokens": 615857808.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "grad_norm": 1.4868769645690918, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8913995027542114, + "num_tokens": 615897632.0, + "step": 16144 + }, + { + "epoch": 2.053809947843786, + "grad_norm": 1.6958613395690918, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8852003216743469, + "num_tokens": 615931335.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "grad_norm": 1.4506319761276245, + "learning_rate": 1e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.8986111879348755, + "num_tokens": 615969679.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "grad_norm": 1.5676822662353516, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8867461681365967, + "num_tokens": 616006697.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "grad_norm": 1.5019090175628662, + "learning_rate": 1e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9022194147109985, + "num_tokens": 616042174.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "grad_norm": 1.7128018140792847, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8828286528587341, + "num_tokens": 616075575.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "grad_norm": 1.634534478187561, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8873566389083862, + "num_tokens": 616110421.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "grad_norm": 1.5480444431304932, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8906976580619812, + "num_tokens": 616146557.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "grad_norm": 1.4168611764907837, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8867371082305908, + "num_tokens": 616188879.0, + "step": 16152 + }, + { + 
"epoch": 2.0548276300725097, + "grad_norm": 1.551013708114624, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8852971792221069, + "num_tokens": 616226283.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "grad_norm": 1.5204421281814575, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8883660435676575, + "num_tokens": 616262951.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "grad_norm": 1.7055467367172241, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8745219707489014, + "num_tokens": 616298042.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "grad_norm": 1.5418074131011963, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.898585319519043, + "num_tokens": 616335642.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "grad_norm": 1.6463563442230225, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8729865550994873, + "num_tokens": 616370194.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "grad_norm": 1.6083236932754517, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8936910033226013, + "num_tokens": 616408151.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "grad_norm": 1.367502212524414, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8910555839538574, + "num_tokens": 616455049.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "grad_norm": 1.593059778213501, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8827119469642639, + "num_tokens": 616492790.0, + "step": 16160 + }, + { + "epoch": 2.055845312301234, + "grad_norm": 1.6874027252197266, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8730638027191162, + "num_tokens": 616529837.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "grad_norm": 1.4022160768508911, + "learning_rate": 1e-06, + "loss": 0.2914, + 
"mean_token_accuracy": 0.8963662385940552, + "num_tokens": 616568897.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "grad_norm": 1.9087673425674438, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8860992789268494, + "num_tokens": 616606477.0, + "step": 16163 + }, + { + "epoch": 2.0562269431370055, + "grad_norm": 1.463046908378601, + "learning_rate": 1e-06, + "loss": 0.2652, + "mean_token_accuracy": 0.9047746658325195, + "num_tokens": 616643133.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "grad_norm": 1.6674880981445312, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8849086761474609, + "num_tokens": 616673324.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "grad_norm": 1.6614997386932373, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8803000450134277, + "num_tokens": 616707286.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "grad_norm": 1.4743249416351318, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8914148807525635, + "num_tokens": 616749459.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "grad_norm": 1.4353420734405518, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8768622875213623, + "num_tokens": 616794995.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "grad_norm": 1.5401172637939453, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8965655565261841, + "num_tokens": 616834624.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "grad_norm": 1.5616737604141235, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8883123397827148, + "num_tokens": 616871340.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "grad_norm": 1.6544773578643799, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8832983374595642, + "num_tokens": 616906880.0, + "step": 16171 + }, + { + "epoch": 
2.0572446253657297, + "grad_norm": 1.6285127401351929, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8877310752868652, + "num_tokens": 616943441.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "grad_norm": 1.5080991983413696, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8868609070777893, + "num_tokens": 616983157.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "grad_norm": 1.6877241134643555, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8801658153533936, + "num_tokens": 617021133.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "grad_norm": 1.560153603553772, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8904008865356445, + "num_tokens": 617058741.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "grad_norm": 1.628373622894287, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8731476664543152, + "num_tokens": 617097179.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "grad_norm": 1.435138463973999, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8912736177444458, + "num_tokens": 617137667.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "grad_norm": 1.5664690732955933, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8952062129974365, + "num_tokens": 617178972.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "grad_norm": 1.6884230375289917, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8898891806602478, + "num_tokens": 617215039.0, + "step": 16179 + }, + { + "epoch": 2.0582623075944535, + "grad_norm": 1.5009759664535522, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8797894716262817, + "num_tokens": 617255495.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "grad_norm": 1.5458488464355469, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 
0.880757749080658, + "num_tokens": 617294600.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "grad_norm": 1.494234323501587, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8852311372756958, + "num_tokens": 617332842.0, + "step": 16182 + }, + { + "epoch": 2.058643938430225, + "grad_norm": 1.5492724180221558, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8892139196395874, + "num_tokens": 617371831.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "grad_norm": 1.579554796218872, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8828327655792236, + "num_tokens": 617409256.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "grad_norm": 1.5852763652801514, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8947674632072449, + "num_tokens": 617444963.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "grad_norm": 1.407196283340454, + "learning_rate": 1e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.8994562029838562, + "num_tokens": 617489030.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "grad_norm": 1.5160390138626099, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8846402168273926, + "num_tokens": 617526136.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "grad_norm": 1.6345508098602295, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8739515542984009, + "num_tokens": 617564245.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "grad_norm": 1.6046885251998901, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8929100036621094, + "num_tokens": 617600739.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "grad_norm": 1.5318881273269653, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.896163284778595, + "num_tokens": 617636707.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "grad_norm": 
1.585188865661621, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8862067461013794, + "num_tokens": 617672607.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "grad_norm": 1.495854139328003, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8946741223335266, + "num_tokens": 617711854.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "grad_norm": 1.6777501106262207, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8820290565490723, + "num_tokens": 617745299.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "grad_norm": 1.73539137840271, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8757653832435608, + "num_tokens": 617775714.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "grad_norm": 1.6323840618133545, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8799436092376709, + "num_tokens": 617812566.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "grad_norm": 1.7441492080688477, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8870967030525208, + "num_tokens": 617844248.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "grad_norm": 1.5994582176208496, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8899341821670532, + "num_tokens": 617877448.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "grad_norm": 1.5600814819335938, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8967170715332031, + "num_tokens": 617912051.0, + "step": 16198 + }, + { + "epoch": 2.0606793028876735, + "grad_norm": 1.5766692161560059, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.873349130153656, + "num_tokens": 617950287.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "grad_norm": 1.4536837339401245, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8878250122070312, + "num_tokens": 
617993778.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "grad_norm": 1.630019187927246, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8719993829727173, + "num_tokens": 618032157.0, + "step": 16201 + }, + { + "epoch": 2.0610609337234447, + "grad_norm": 1.5404118299484253, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8822499513626099, + "num_tokens": 618073696.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "grad_norm": 1.4490498304367065, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8820069432258606, + "num_tokens": 618115360.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "grad_norm": 1.6389437913894653, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8833092451095581, + "num_tokens": 618153774.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "grad_norm": 1.603365421295166, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8786128759384155, + "num_tokens": 618193875.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "grad_norm": 1.8432409763336182, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.873370349407196, + "num_tokens": 618224280.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "grad_norm": 1.54291832447052, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8937294483184814, + "num_tokens": 618260366.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "grad_norm": 1.4312652349472046, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8903053402900696, + "num_tokens": 618301467.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "grad_norm": 1.559407353401184, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.880420446395874, + "num_tokens": 618341075.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "grad_norm": 1.3835512399673462, + "learning_rate": 1e-06, + 
"loss": 0.2923, + "mean_token_accuracy": 0.8945097327232361, + "num_tokens": 618383623.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "grad_norm": 1.4893372058868408, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8839991688728333, + "num_tokens": 618428068.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "grad_norm": 1.610352635383606, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8871120810508728, + "num_tokens": 618460959.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "grad_norm": 1.635575532913208, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.882840633392334, + "num_tokens": 618502414.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "grad_norm": 1.499606966972351, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.898772120475769, + "num_tokens": 618540318.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "grad_norm": 1.4988034963607788, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8786440491676331, + "num_tokens": 618582165.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "grad_norm": 1.4684489965438843, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8859372138977051, + "num_tokens": 618623302.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "grad_norm": 1.591280460357666, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8945742249488831, + "num_tokens": 618658382.0, + "step": 16217 + }, + { + "epoch": 2.063096298180893, + "grad_norm": 1.4833953380584717, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.89137202501297, + "num_tokens": 618695755.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "grad_norm": 1.560105323791504, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8896939754486084, + "num_tokens": 618730569.0, + "step": 16219 + }, + { + "epoch": 
2.063350718738074, + "grad_norm": 1.7355889081954956, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8650071024894714, + "num_tokens": 618762435.0, + "step": 16220 + }, + { + "epoch": 2.0634779290166647, + "grad_norm": 1.5936726331710815, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8870667219161987, + "num_tokens": 618798472.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "grad_norm": 1.6522871255874634, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.87653648853302, + "num_tokens": 618835617.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "grad_norm": 1.5678842067718506, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8831279277801514, + "num_tokens": 618872604.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "grad_norm": 1.4896916151046753, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8814681768417358, + "num_tokens": 618917171.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "grad_norm": 1.5954171419143677, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8853689432144165, + "num_tokens": 618952967.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "grad_norm": 1.5315711498260498, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8909101486206055, + "num_tokens": 618989980.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "grad_norm": 1.4485864639282227, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8899326920509338, + "num_tokens": 619028099.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "grad_norm": 1.5740935802459717, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8736258745193481, + "num_tokens": 619069899.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "grad_norm": 1.629708170890808, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 
0.8816977739334106, + "num_tokens": 619109588.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "grad_norm": 1.5758246183395386, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8777620792388916, + "num_tokens": 619148156.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "grad_norm": 1.7498893737792969, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8804473280906677, + "num_tokens": 619181414.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "grad_norm": 1.4908517599105835, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8765363693237305, + "num_tokens": 619224616.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "grad_norm": 1.599380373954773, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8783402442932129, + "num_tokens": 619263742.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "grad_norm": 1.4057689905166626, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9003239870071411, + "num_tokens": 619304349.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "grad_norm": 1.5821595191955566, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8865503072738647, + "num_tokens": 619340272.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "grad_norm": 1.5288723707199097, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8897838592529297, + "num_tokens": 619378474.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "grad_norm": 1.6266911029815674, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8994318246841431, + "num_tokens": 619408255.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "grad_norm": 1.6041550636291504, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8908429145812988, + "num_tokens": 619443655.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "grad_norm": 
1.6406058073043823, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8750744462013245, + "num_tokens": 619482280.0, + "step": 16239 + }, + { + "epoch": 2.0658949243098843, + "grad_norm": 1.5168533325195312, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8839365243911743, + "num_tokens": 619522197.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "grad_norm": 1.6921473741531372, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8812936544418335, + "num_tokens": 619558554.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "grad_norm": 1.4807486534118652, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8932989835739136, + "num_tokens": 619598909.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "grad_norm": 1.7849433422088623, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8790900707244873, + "num_tokens": 619633825.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "grad_norm": 1.4914542436599731, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8734415173530579, + "num_tokens": 619679679.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "grad_norm": 1.7847931385040283, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8656063675880432, + "num_tokens": 619713471.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "grad_norm": 1.4612799882888794, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9025823473930359, + "num_tokens": 619752451.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "grad_norm": 1.5527150630950928, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.871789813041687, + "num_tokens": 619793197.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "grad_norm": 1.446945071220398, + "learning_rate": 1e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9009538888931274, + "num_tokens": 
619831340.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "grad_norm": 1.4584559202194214, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8764452934265137, + "num_tokens": 619873096.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "grad_norm": 1.664101004600525, + "learning_rate": 1e-06, + "loss": 0.284, + "mean_token_accuracy": 0.897317111492157, + "num_tokens": 619905944.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "grad_norm": 1.712337851524353, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8817175626754761, + "num_tokens": 619940519.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "grad_norm": 1.6889232397079468, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8794936537742615, + "num_tokens": 619977639.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "grad_norm": 1.4762396812438965, + "learning_rate": 1e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.905186653137207, + "num_tokens": 620015121.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "grad_norm": 1.5498125553131104, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8864131569862366, + "num_tokens": 620052134.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "grad_norm": 1.658270001411438, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.87937331199646, + "num_tokens": 620089618.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "grad_norm": 1.4372868537902832, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8875665664672852, + "num_tokens": 620134180.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "grad_norm": 1.7005164623260498, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8871200084686279, + "num_tokens": 620167853.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "grad_norm": 1.755873680114746, + "learning_rate": 1e-06, + 
"loss": 0.2844, + "mean_token_accuracy": 0.8976539373397827, + "num_tokens": 620198174.0, + "step": 16258 + }, + { + "epoch": 2.068311919603104, + "grad_norm": 1.5084534883499146, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.881754994392395, + "num_tokens": 620243236.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "grad_norm": 1.7168656587600708, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8615787029266357, + "num_tokens": 620279440.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "grad_norm": 1.6485493183135986, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8924480080604553, + "num_tokens": 620316977.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "grad_norm": 1.6945770978927612, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8736854195594788, + "num_tokens": 620354797.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "grad_norm": 1.6163328886032104, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8809535503387451, + "num_tokens": 620392885.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "grad_norm": 1.603174090385437, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8900492787361145, + "num_tokens": 620430910.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "grad_norm": 1.639417290687561, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8742587566375732, + "num_tokens": 620469435.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "grad_norm": 1.7235313653945923, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8837159872055054, + "num_tokens": 620502146.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "grad_norm": 1.6639161109924316, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8858810663223267, + "num_tokens": 620540717.0, + "step": 16267 + }, + { + "epoch": 
2.0694568121104187, + "grad_norm": 1.4991023540496826, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8898520469665527, + "num_tokens": 620585649.0, + "step": 16268 + }, + { + "epoch": 2.069584022389009, + "grad_norm": 1.6606965065002441, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8929558396339417, + "num_tokens": 620619744.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "grad_norm": 1.6138068437576294, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8741705417633057, + "num_tokens": 620658867.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "grad_norm": 1.8162983655929565, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8790134191513062, + "num_tokens": 620686652.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "grad_norm": 1.373425006866455, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8916683793067932, + "num_tokens": 620729621.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "grad_norm": 1.7659375667572021, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8852435946464539, + "num_tokens": 620761783.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "grad_norm": 1.6430648565292358, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.878819465637207, + "num_tokens": 620798521.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "grad_norm": 1.779733419418335, + "learning_rate": 1e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9019956588745117, + "num_tokens": 620828482.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "grad_norm": 1.5856409072875977, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8947632312774658, + "num_tokens": 620864405.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "grad_norm": 1.4280387163162231, + "learning_rate": 1e-06, + "loss": 0.284, + "mean_token_accuracy": 
0.8967143893241882, + "num_tokens": 620902972.0, + "step": 16277 + }, + { + "epoch": 2.0707289148963235, + "grad_norm": 1.6595444679260254, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8905592560768127, + "num_tokens": 620936443.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "grad_norm": 1.7617621421813965, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.881943941116333, + "num_tokens": 620966595.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "grad_norm": 1.4762115478515625, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.892719030380249, + "num_tokens": 621007633.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "grad_norm": 1.6812702417373657, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8801809549331665, + "num_tokens": 621039909.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "grad_norm": 1.523520588874817, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8797744512557983, + "num_tokens": 621081205.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "grad_norm": 1.7870796918869019, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.882864773273468, + "num_tokens": 621113805.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "grad_norm": 1.4687772989273071, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.889300525188446, + "num_tokens": 621155084.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "grad_norm": 1.5479545593261719, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8838478326797485, + "num_tokens": 621193852.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "grad_norm": 1.6131786108016968, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8688384294509888, + "num_tokens": 621233020.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "grad_norm": 
1.5673829317092896, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8834562301635742, + "num_tokens": 621271841.0, + "step": 16287 + }, + { + "epoch": 2.0720010176822288, + "grad_norm": 1.64373779296875, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8920559883117676, + "num_tokens": 621313150.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "grad_norm": 1.5439703464508057, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8882452249526978, + "num_tokens": 621353165.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "grad_norm": 1.4607765674591064, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8892632126808167, + "num_tokens": 621396259.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "grad_norm": 1.4948008060455322, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8980510234832764, + "num_tokens": 621435213.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "grad_norm": 1.7930241823196411, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8908534049987793, + "num_tokens": 621465715.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "grad_norm": 1.5126045942306519, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.895873486995697, + "num_tokens": 621501263.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "grad_norm": 1.5518651008605957, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8845254182815552, + "num_tokens": 621538112.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "grad_norm": 1.5470771789550781, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8885859251022339, + "num_tokens": 621577320.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "grad_norm": 1.5856971740722656, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8854587078094482, + "num_tokens": 
621613959.0, + "step": 16296 + }, + { + "epoch": 2.0731459101895435, + "grad_norm": 1.6988623142242432, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8809534907341003, + "num_tokens": 621648589.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "grad_norm": 1.5588761568069458, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8678668737411499, + "num_tokens": 621690080.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "grad_norm": 1.7047306299209595, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8872950077056885, + "num_tokens": 621722754.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "grad_norm": 1.6495144367218018, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8694972395896912, + "num_tokens": 621759980.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "grad_norm": 1.5491998195648193, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8744696378707886, + "num_tokens": 621800978.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "grad_norm": 1.5143526792526245, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8984307050704956, + "num_tokens": 621836644.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "grad_norm": 1.6708568334579468, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8952744007110596, + "num_tokens": 621866826.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "grad_norm": 1.5905847549438477, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8874156475067139, + "num_tokens": 621902882.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "grad_norm": 1.7124295234680176, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8824057579040527, + "num_tokens": 621935170.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "grad_norm": 1.4584892988204956, + "learning_rate": 
1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8955219984054565, + "num_tokens": 621975052.0, + "step": 16306 + }, + { + "epoch": 2.0744180129754484, + "grad_norm": 1.5212583541870117, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8871573805809021, + "num_tokens": 622014931.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "grad_norm": 1.6209585666656494, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8813668489456177, + "num_tokens": 622054161.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "grad_norm": 1.6529630422592163, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8898570537567139, + "num_tokens": 622089379.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "grad_norm": 1.5453449487686157, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.883112907409668, + "num_tokens": 622125686.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "grad_norm": 1.5320976972579956, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8966792821884155, + "num_tokens": 622162257.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "grad_norm": 1.5376924276351929, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8931635618209839, + "num_tokens": 622199969.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "grad_norm": 1.4393906593322754, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.885467529296875, + "num_tokens": 622244402.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "grad_norm": 1.357701301574707, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8936492800712585, + "num_tokens": 622287572.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "grad_norm": 1.5329097509384155, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8805122375488281, + "num_tokens": 622329104.0, + "step": 16315 + }, + { + 
"epoch": 2.075562905482763, + "grad_norm": 1.5675959587097168, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8888860940933228, + "num_tokens": 622367164.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "grad_norm": 1.74875009059906, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.879159152507782, + "num_tokens": 622401831.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "grad_norm": 1.5486259460449219, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8900808095932007, + "num_tokens": 622446217.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "grad_norm": 1.5073564052581787, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8921389579772949, + "num_tokens": 622482579.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "grad_norm": 1.6386134624481201, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8909119963645935, + "num_tokens": 622521654.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "grad_norm": 1.641444444656372, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8880442976951599, + "num_tokens": 622556821.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "grad_norm": 1.6112309694290161, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8703013062477112, + "num_tokens": 622595420.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "grad_norm": 1.376268982887268, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.889127254486084, + "num_tokens": 622643383.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "grad_norm": 1.5961265563964844, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8883665204048157, + "num_tokens": 622679780.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "grad_norm": 1.7564222812652588, + "learning_rate": 1e-06, + "loss": 0.3251, + 
"mean_token_accuracy": 0.8842216730117798, + "num_tokens": 622712557.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "grad_norm": 1.4855670928955078, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.891546368598938, + "num_tokens": 622752709.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "grad_norm": 1.5687907934188843, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8888547420501709, + "num_tokens": 622790912.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "grad_norm": 1.6736629009246826, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8724700808525085, + "num_tokens": 622824913.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "grad_norm": 1.5729891061782837, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8783938884735107, + "num_tokens": 622865954.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "grad_norm": 1.572347640991211, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.888264000415802, + "num_tokens": 622901392.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "grad_norm": 1.4479774236679077, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8935924172401428, + "num_tokens": 622941506.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "grad_norm": 1.654494285583496, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.884575605392456, + "num_tokens": 622976969.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "grad_norm": 1.5336343050003052, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8824795484542847, + "num_tokens": 623014818.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "grad_norm": 1.493500828742981, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.889674961566925, + "num_tokens": 623055186.0, + "step": 16334 + }, + { + "epoch": 2.0779799007759827, + 
"grad_norm": 1.7108622789382935, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8714567422866821, + "num_tokens": 623088713.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "grad_norm": 1.5383296012878418, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.895624041557312, + "num_tokens": 623124674.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "grad_norm": 1.5862804651260376, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8928484916687012, + "num_tokens": 623161713.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "grad_norm": 1.471030831336975, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8972609043121338, + "num_tokens": 623201791.0, + "step": 16338 + }, + { + "epoch": 2.078488741890345, + "grad_norm": 1.5103027820587158, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8792741894721985, + "num_tokens": 623240252.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "grad_norm": 1.4312362670898438, + "learning_rate": 1e-06, + "loss": 0.2657, + "mean_token_accuracy": 0.8996330499649048, + "num_tokens": 623279882.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "grad_norm": 1.7326604127883911, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8726215362548828, + "num_tokens": 623315792.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "grad_norm": 1.6680209636688232, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8847760558128357, + "num_tokens": 623350280.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "grad_norm": 1.6314421892166138, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8775161504745483, + "num_tokens": 623390348.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "grad_norm": 1.5103720426559448, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8951916098594666, + 
"num_tokens": 623430596.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "grad_norm": 1.5760875940322876, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8905566930770874, + "num_tokens": 623468341.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "grad_norm": 1.6025912761688232, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8926824927330017, + "num_tokens": 623501875.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "grad_norm": 1.5373802185058594, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8866010904312134, + "num_tokens": 623542408.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "grad_norm": 1.5382000207901, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8860029578208923, + "num_tokens": 623583243.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "grad_norm": 1.6107935905456543, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8776124715805054, + "num_tokens": 623626461.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "grad_norm": 1.5046266317367554, + "learning_rate": 1e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9065775871276855, + "num_tokens": 623660946.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "grad_norm": 1.717467188835144, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8760166764259338, + "num_tokens": 623697046.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "grad_norm": 1.4567421674728394, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8813061714172363, + "num_tokens": 623744063.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "grad_norm": 1.508324384689331, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8871655464172363, + "num_tokens": 623785840.0, + "step": 16353 + }, + { + "epoch": 2.0803968960692023, + "grad_norm": 1.5464060306549072, + 
"learning_rate": 1e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9031288623809814, + "num_tokens": 623819837.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "grad_norm": 1.4799093008041382, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.884796679019928, + "num_tokens": 623860215.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "grad_norm": 1.3702775239944458, + "learning_rate": 1e-06, + "loss": 0.272, + "mean_token_accuracy": 0.9007595777511597, + "num_tokens": 623901477.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "grad_norm": 1.7467434406280518, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8929018378257751, + "num_tokens": 623932174.0, + "step": 16357 + }, + { + "epoch": 2.0809057371835644, + "grad_norm": 1.6763584613800049, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8870792388916016, + "num_tokens": 623965124.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "grad_norm": 1.5843384265899658, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8950068950653076, + "num_tokens": 624000722.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "grad_norm": 1.5633034706115723, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8867743611335754, + "num_tokens": 624039214.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "grad_norm": 1.517659306526184, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.886669397354126, + "num_tokens": 624078897.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "grad_norm": 1.493065357208252, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8907155394554138, + "num_tokens": 624122459.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "grad_norm": 1.5201184749603271, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8837286233901978, + "num_tokens": 624162564.0, + "step": 
16363 + }, + { + "epoch": 2.0816689988551076, + "grad_norm": 1.4029861688613892, + "learning_rate": 1e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.8928707838058472, + "num_tokens": 624203586.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "grad_norm": 1.7074410915374756, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8866185545921326, + "num_tokens": 624243542.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "grad_norm": 1.5125579833984375, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8911275267601013, + "num_tokens": 624278008.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "grad_norm": 1.5019997358322144, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8971145749092102, + "num_tokens": 624313681.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "grad_norm": 1.6257394552230835, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8866533637046814, + "num_tokens": 624348760.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "grad_norm": 1.5409718751907349, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8768555521965027, + "num_tokens": 624389778.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "grad_norm": 1.5786651372909546, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8889114856719971, + "num_tokens": 624426607.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "grad_norm": 1.3985694646835327, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8920282125473022, + "num_tokens": 624469842.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "grad_norm": 1.5285404920578003, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8729220628738403, + "num_tokens": 624513822.0, + "step": 16372 + }, + { + "epoch": 2.082813891362422, + "grad_norm": 1.5827441215515137, + "learning_rate": 1e-06, + "loss": 
0.3298, + "mean_token_accuracy": 0.8819848299026489, + "num_tokens": 624552795.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "grad_norm": 1.3222260475158691, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8873937129974365, + "num_tokens": 624604904.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "grad_norm": 1.5709775686264038, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8923245668411255, + "num_tokens": 624639162.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "grad_norm": 1.642905592918396, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8840398192405701, + "num_tokens": 624670874.0, + "step": 16376 + }, + { + "epoch": 2.083322732476784, + "grad_norm": 1.4488303661346436, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8830469250679016, + "num_tokens": 624713709.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "grad_norm": 1.560957670211792, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8826714754104614, + "num_tokens": 624749893.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "grad_norm": 1.464409351348877, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8801173567771912, + "num_tokens": 624793455.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "grad_norm": 1.5187724828720093, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8927047252655029, + "num_tokens": 624831515.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "grad_norm": 1.497209906578064, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8804014921188354, + "num_tokens": 624871562.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "grad_norm": 1.6068161725997925, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8871080875396729, + "num_tokens": 624905198.0, + "step": 16382 + }, + { + "epoch": 
2.084085994148327, + "grad_norm": 1.734188199043274, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8731400370597839, + "num_tokens": 624938367.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "grad_norm": 1.447837233543396, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8905384540557861, + "num_tokens": 624979576.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "grad_norm": 1.6367641687393188, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8813509941101074, + "num_tokens": 625015437.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "grad_norm": 1.5890790224075317, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8782995939254761, + "num_tokens": 625053189.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "grad_norm": 1.573955774307251, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8774850368499756, + "num_tokens": 625092401.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "grad_norm": 1.628600001335144, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8839782476425171, + "num_tokens": 625131616.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "grad_norm": 1.4870338439941406, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8778201341629028, + "num_tokens": 625174274.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "grad_norm": 1.70071280002594, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8725906610488892, + "num_tokens": 625210449.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "grad_norm": 1.5037649869918823, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.90069580078125, + "num_tokens": 625246762.0, + "step": 16391 + }, + { + "epoch": 2.085230886655642, + "grad_norm": 1.5950675010681152, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 
0.881897509098053, + "num_tokens": 625287017.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "grad_norm": 1.5647817850112915, + "learning_rate": 1e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.8982069492340088, + "num_tokens": 625320500.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "grad_norm": 1.5430983304977417, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8792623281478882, + "num_tokens": 625360543.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "grad_norm": 1.5471901893615723, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8815631866455078, + "num_tokens": 625401350.0, + "step": 16395 + }, + { + "epoch": 2.0857397277700036, + "grad_norm": 1.6236629486083984, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8889322876930237, + "num_tokens": 625437577.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "grad_norm": 1.5892550945281982, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8904874324798584, + "num_tokens": 625470208.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "grad_norm": 1.6058731079101562, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8816796541213989, + "num_tokens": 625508472.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "grad_norm": 1.497705340385437, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8958693742752075, + "num_tokens": 625544527.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "grad_norm": 1.6122161149978638, + "learning_rate": 1e-06, + "loss": 0.2607, + "mean_token_accuracy": 0.9027347564697266, + "num_tokens": 625575389.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "grad_norm": 1.6719087362289429, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8678344488143921, + "num_tokens": 625609673.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "grad_norm": 
1.4988298416137695, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8816863298416138, + "num_tokens": 625651323.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "grad_norm": 1.5929754972457886, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8783951997756958, + "num_tokens": 625690554.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "grad_norm": 1.4989516735076904, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.9043781757354736, + "num_tokens": 625728909.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "grad_norm": 1.4682519435882568, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8929222226142883, + "num_tokens": 625768486.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "grad_norm": 1.5561779737472534, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8802560567855835, + "num_tokens": 625806756.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "grad_norm": 1.6827912330627441, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8844680190086365, + "num_tokens": 625844650.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "grad_norm": 1.5032349824905396, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8932550549507141, + "num_tokens": 625885551.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "grad_norm": 1.5262070894241333, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8848332166671753, + "num_tokens": 625924603.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "grad_norm": 1.5115469694137573, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8781706094741821, + "num_tokens": 625966120.0, + "step": 16410 + }, + { + "epoch": 2.0876478819488615, + "grad_norm": 1.5250725746154785, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8769754767417908, + "num_tokens": 
626005642.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "grad_norm": 1.592270016670227, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8831459879875183, + "num_tokens": 626043695.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "grad_norm": 1.5426266193389893, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8858721256256104, + "num_tokens": 626086226.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "grad_norm": 1.673961877822876, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8812847137451172, + "num_tokens": 626119903.0, + "step": 16414 + }, + { + "epoch": 2.0881567230632236, + "grad_norm": 1.6785520315170288, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8858570456504822, + "num_tokens": 626151812.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "grad_norm": 1.5927976369857788, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8651332259178162, + "num_tokens": 626195083.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "grad_norm": 1.609778642654419, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.893446683883667, + "num_tokens": 626229581.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "grad_norm": 1.6746307611465454, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8775904178619385, + "num_tokens": 626266100.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "grad_norm": 1.5210285186767578, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8746897578239441, + "num_tokens": 626308044.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "grad_norm": 1.4074227809906006, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8953019380569458, + "num_tokens": 626351560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "grad_norm": 1.5073847770690918, + "learning_rate": 
1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8762921094894409, + "num_tokens": 626396315.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "grad_norm": 1.6951287984848022, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8695927858352661, + "num_tokens": 626434189.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "grad_norm": 1.700380802154541, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8740482330322266, + "num_tokens": 626469053.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "grad_norm": 1.6434102058410645, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8787927031517029, + "num_tokens": 626506265.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "grad_norm": 1.3814983367919922, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8884588479995728, + "num_tokens": 626553175.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "grad_norm": 1.6091759204864502, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8778022527694702, + "num_tokens": 626589104.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "grad_norm": 1.5072354078292847, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8843849301338196, + "num_tokens": 626631288.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "grad_norm": 1.4952501058578491, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8981844782829285, + "num_tokens": 626668114.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "grad_norm": 1.5548806190490723, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8896501064300537, + "num_tokens": 626706324.0, + "step": 16429 + }, + { + "epoch": 2.090064877242081, + "grad_norm": 1.6273795366287231, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8859342336654663, + "num_tokens": 626740882.0, + "step": 16430 + }, + { + 
"epoch": 2.0901920875206716, + "grad_norm": 1.4664275646209717, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9024021029472351, + "num_tokens": 626782993.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "grad_norm": 1.6127992868423462, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8789207339286804, + "num_tokens": 626820553.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "grad_norm": 1.4584733247756958, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8924922943115234, + "num_tokens": 626859853.0, + "step": 16433 + }, + { + "epoch": 2.0905737183564432, + "grad_norm": 1.6935240030288696, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8594874143600464, + "num_tokens": 626900263.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "grad_norm": 1.7051985263824463, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8804917335510254, + "num_tokens": 626933720.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "grad_norm": 1.5731208324432373, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8912139534950256, + "num_tokens": 626972066.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "grad_norm": 1.4565290212631226, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8870289921760559, + "num_tokens": 627015094.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "grad_norm": 1.4209530353546143, + "learning_rate": 1e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.8982526063919067, + "num_tokens": 627055105.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "grad_norm": 1.5279879570007324, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8917859792709351, + "num_tokens": 627092915.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "grad_norm": 1.5596593618392944, + "learning_rate": 1e-06, + "loss": 0.3683, + 
"mean_token_accuracy": 0.8698799014091492, + "num_tokens": 627131345.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "grad_norm": 1.4776548147201538, + "learning_rate": 1e-06, + "loss": 0.2623, + "mean_token_accuracy": 0.9042888879776001, + "num_tokens": 627168049.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "grad_norm": 1.5538071393966675, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8820995688438416, + "num_tokens": 627206499.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "grad_norm": 1.4458763599395752, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.9014091491699219, + "num_tokens": 627243221.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "grad_norm": 1.430978536605835, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.8985783457756042, + "num_tokens": 627280960.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "grad_norm": 1.653478980064392, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8642347455024719, + "num_tokens": 627321854.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "grad_norm": 1.531298041343689, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8892872333526611, + "num_tokens": 627358340.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "grad_norm": 1.487930417060852, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8864562511444092, + "num_tokens": 627397053.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "grad_norm": 1.843444585800171, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8768737316131592, + "num_tokens": 627433694.0, + "step": 16448 + }, + { + "epoch": 2.0924818725353007, + "grad_norm": 1.601340413093567, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8729369640350342, + "num_tokens": 627473412.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + 
"grad_norm": 1.5397367477416992, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8762433528900146, + "num_tokens": 627512324.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "grad_norm": 1.580507755279541, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8823392391204834, + "num_tokens": 627551478.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "grad_norm": 1.5101596117019653, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.9006147384643555, + "num_tokens": 627589775.0, + "step": 16452 + }, + { + "epoch": 2.092990713649663, + "grad_norm": 1.688088059425354, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8742685317993164, + "num_tokens": 627628382.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "grad_norm": 1.540662169456482, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8694568872451782, + "num_tokens": 627670244.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "grad_norm": 1.6263680458068848, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8914333581924438, + "num_tokens": 627709293.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "grad_norm": 1.581206202507019, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8894102573394775, + "num_tokens": 627745556.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "grad_norm": 1.5867198705673218, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8764722347259521, + "num_tokens": 627785965.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "grad_norm": 1.5498664379119873, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8955676555633545, + "num_tokens": 627823211.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "grad_norm": 1.571937918663025, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.888828694820404, + 
"num_tokens": 627860368.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "grad_norm": 1.6710697412490845, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8855681419372559, + "num_tokens": 627897595.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "grad_norm": 1.5578891038894653, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8931208848953247, + "num_tokens": 627935957.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "grad_norm": 1.4486006498336792, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8782626986503601, + "num_tokens": 627978748.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "grad_norm": 1.6075687408447266, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8668509721755981, + "num_tokens": 628017658.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "grad_norm": 1.5328197479248047, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8842999935150146, + "num_tokens": 628058764.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "grad_norm": 1.5352942943572998, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8891885280609131, + "num_tokens": 628093136.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "grad_norm": 1.6193981170654297, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8784492015838623, + "num_tokens": 628131559.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "grad_norm": 1.4790066480636597, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8965950608253479, + "num_tokens": 628171627.0, + "step": 16467 + }, + { + "epoch": 2.0948988678285207, + "grad_norm": 1.5646051168441772, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.883037805557251, + "num_tokens": 628210599.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "grad_norm": 1.5717084407806396, + 
"learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8882795572280884, + "num_tokens": 628247709.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "grad_norm": 1.5337402820587158, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8852313756942749, + "num_tokens": 628285482.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "grad_norm": 1.79703950881958, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8697436451911926, + "num_tokens": 628314250.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "grad_norm": 1.5764830112457275, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8763250112533569, + "num_tokens": 628354581.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "grad_norm": 1.6149592399597168, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8835626840591431, + "num_tokens": 628388434.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "grad_norm": 1.5621356964111328, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.880067765712738, + "num_tokens": 628429158.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "grad_norm": 1.6956979036331177, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8790485262870789, + "num_tokens": 628461749.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "grad_norm": 1.4938774108886719, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8919325470924377, + "num_tokens": 628501214.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "grad_norm": 1.668325662612915, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8905027508735657, + "num_tokens": 628532870.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "grad_norm": 1.5496387481689453, + "learning_rate": 1e-06, + "loss": 0.2751, + "mean_token_accuracy": 0.8998149633407593, + "num_tokens": 628571011.0, + "step": 
16478 + }, + { + "epoch": 2.096298180893016, + "grad_norm": 1.7434031963348389, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8823609352111816, + "num_tokens": 628604290.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "grad_norm": 1.4670624732971191, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8901846408843994, + "num_tokens": 628649435.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "grad_norm": 1.6010513305664062, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8889317512512207, + "num_tokens": 628691239.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "grad_norm": 1.6544349193572998, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8801433444023132, + "num_tokens": 628725001.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "grad_norm": 1.483925461769104, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8943278193473816, + "num_tokens": 628762955.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "grad_norm": 1.5998785495758057, + "learning_rate": 1e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.9004075527191162, + "num_tokens": 628798972.0, + "step": 16484 + }, + { + "epoch": 2.0970614425645593, + "grad_norm": 1.5785293579101562, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.888799786567688, + "num_tokens": 628837099.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "grad_norm": 1.4908709526062012, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8930524587631226, + "num_tokens": 628877291.0, + "step": 16486 + }, + { + "epoch": 2.0973158631217403, + "grad_norm": 1.5069146156311035, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8946332931518555, + "num_tokens": 628916165.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "grad_norm": 1.5059890747070312, + "learning_rate": 1e-06, + "loss": 0.2857, 
+ "mean_token_accuracy": 0.8974526524543762, + "num_tokens": 628954701.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "grad_norm": 1.5405327081680298, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8928842544555664, + "num_tokens": 628992893.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "grad_norm": 1.5064235925674438, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8815815448760986, + "num_tokens": 629034818.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "grad_norm": 1.4190690517425537, + "learning_rate": 1e-06, + "loss": 0.242, + "mean_token_accuracy": 0.9111291170120239, + "num_tokens": 629072681.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "grad_norm": 1.70539391040802, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.87239670753479, + "num_tokens": 629109598.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "grad_norm": 1.5363640785217285, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8951719403266907, + "num_tokens": 629148961.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "grad_norm": 1.5220707654953003, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8787559270858765, + "num_tokens": 629194405.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "grad_norm": 1.5557976961135864, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8923620581626892, + "num_tokens": 629230755.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "grad_norm": 1.6466625928878784, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.878218412399292, + "num_tokens": 629269145.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "grad_norm": 1.4465736150741577, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8856300711631775, + "num_tokens": 629311252.0, + "step": 16497 + }, + { + "epoch": 
2.0987151761862357, + "grad_norm": 1.586544394493103, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8710972666740417, + "num_tokens": 629353590.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "grad_norm": 1.5280640125274658, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8899727463722229, + "num_tokens": 629393641.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "grad_norm": 1.5539265871047974, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8767938017845154, + "num_tokens": 629432438.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "grad_norm": 1.461362361907959, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8904666900634766, + "num_tokens": 629475919.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "grad_norm": 1.5454164743423462, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8824373483657837, + "num_tokens": 629512261.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "grad_norm": 1.6257323026657104, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8674934506416321, + "num_tokens": 629551607.0, + "step": 16503 + }, + { + "epoch": 2.099478437857779, + "grad_norm": 1.4722111225128174, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8859850764274597, + "num_tokens": 629593006.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "grad_norm": 1.6493281126022339, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8825137615203857, + "num_tokens": 629631110.0, + "step": 16505 + }, + { + "epoch": 2.09973285841496, + "grad_norm": 1.5042344331741333, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8807225227355957, + "num_tokens": 629672627.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "grad_norm": 1.447898268699646, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 
0.8891609311103821, + "num_tokens": 629715093.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "grad_norm": 1.4689438343048096, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8937934637069702, + "num_tokens": 629753789.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "grad_norm": 1.4982163906097412, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8787055015563965, + "num_tokens": 629797753.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "grad_norm": 1.433684229850769, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8985919952392578, + "num_tokens": 629836147.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "grad_norm": 1.6775263547897339, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8870915770530701, + "num_tokens": 629869002.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "grad_norm": 1.5132189989089966, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8792694807052612, + "num_tokens": 629908653.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "grad_norm": 1.5165982246398926, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8901010155677795, + "num_tokens": 629944279.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "grad_norm": 1.6731863021850586, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8630403876304626, + "num_tokens": 629982396.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "grad_norm": 1.389663815498352, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8872569799423218, + "num_tokens": 630026687.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "grad_norm": 1.5727365016937256, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8844678997993469, + "num_tokens": 630064176.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "grad_norm": 
1.5394223928451538, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8823891878128052, + "num_tokens": 630101179.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "grad_norm": 1.6197227239608765, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8891122937202454, + "num_tokens": 630134913.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "grad_norm": 1.5844446420669556, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8985706567764282, + "num_tokens": 630170026.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "grad_norm": 1.6301349401474, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.88859623670578, + "num_tokens": 630204926.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "grad_norm": 1.622043251991272, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8860846161842346, + "num_tokens": 630240910.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "grad_norm": 1.560692310333252, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8882646560668945, + "num_tokens": 630278633.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "grad_norm": 1.425418734550476, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8887353539466858, + "num_tokens": 630324421.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "grad_norm": 1.414143443107605, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8909491300582886, + "num_tokens": 630367814.0, + "step": 16524 + }, + { + "epoch": 2.1021498537081795, + "grad_norm": 1.6031748056411743, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8789424300193787, + "num_tokens": 630407191.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "grad_norm": 1.6982935667037964, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8586285710334778, + "num_tokens": 
630442997.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "grad_norm": 1.6289348602294922, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.892144501209259, + "num_tokens": 630479682.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "grad_norm": 1.5630111694335938, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8765479326248169, + "num_tokens": 630520047.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "grad_norm": 1.560888409614563, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8801531791687012, + "num_tokens": 630557097.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "grad_norm": 1.4820901155471802, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8907322883605957, + "num_tokens": 630600024.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "grad_norm": 1.6834213733673096, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8780021667480469, + "num_tokens": 630635414.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "grad_norm": 1.470398187637329, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8871192932128906, + "num_tokens": 630678765.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "grad_norm": 1.4777971506118774, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8966420888900757, + "num_tokens": 630718225.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "grad_norm": 1.5834312438964844, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.884360671043396, + "num_tokens": 630754677.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "grad_norm": 1.6734946966171265, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8946359753608704, + "num_tokens": 630785796.0, + "step": 16535 + }, + { + "epoch": 2.1035491667726753, + "grad_norm": 1.5010509490966797, + "learning_rate": 
1e-06, + "loss": 0.2553, + "mean_token_accuracy": 0.9094213843345642, + "num_tokens": 630824375.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "grad_norm": 1.5988593101501465, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8808116912841797, + "num_tokens": 630862980.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "grad_norm": 1.5785894393920898, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.899446964263916, + "num_tokens": 630895935.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "grad_norm": 1.5683093070983887, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8760644197463989, + "num_tokens": 630938596.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "grad_norm": 1.467797040939331, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8874032497406006, + "num_tokens": 630981716.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "grad_norm": 1.7535548210144043, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8969497680664062, + "num_tokens": 631011825.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "grad_norm": 1.6200815439224243, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8876128792762756, + "num_tokens": 631051342.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "grad_norm": 1.517310380935669, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8972290754318237, + "num_tokens": 631089653.0, + "step": 16543 + }, + { + "epoch": 2.104566849001399, + "grad_norm": 1.6165534257888794, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8883423805236816, + "num_tokens": 631128064.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "grad_norm": 1.5497100353240967, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8898550868034363, + "num_tokens": 631168106.0, + "step": 16545 + }, + { + 
"epoch": 2.10482126955858, + "grad_norm": 1.547996163368225, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8689165711402893, + "num_tokens": 631213732.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "grad_norm": 1.6316355466842651, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8632521629333496, + "num_tokens": 631256882.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "grad_norm": 1.511535882949829, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8924091458320618, + "num_tokens": 631297373.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "grad_norm": 1.6172517538070679, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8986449241638184, + "num_tokens": 631331434.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "grad_norm": 1.4591468572616577, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8854314684867859, + "num_tokens": 631375596.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "grad_norm": 1.6413910388946533, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.890114426612854, + "num_tokens": 631409989.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "grad_norm": 1.5784803628921509, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8878090381622314, + "num_tokens": 631445396.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "grad_norm": 1.420526385307312, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8905792236328125, + "num_tokens": 631487713.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "grad_norm": 1.6807399988174438, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8849540948867798, + "num_tokens": 631522198.0, + "step": 16554 + }, + { + "epoch": 2.105966162065895, + "grad_norm": 1.5706939697265625, + "learning_rate": 1e-06, + "loss": 0.3465, + 
"mean_token_accuracy": 0.8766151666641235, + "num_tokens": 631563399.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "grad_norm": 1.5985991954803467, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8824220895767212, + "num_tokens": 631599886.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "grad_norm": 1.770721435546875, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8849815130233765, + "num_tokens": 631635892.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "grad_norm": 1.59906005859375, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8928050994873047, + "num_tokens": 631670324.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "grad_norm": 1.609013557434082, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8916022181510925, + "num_tokens": 631705316.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "grad_norm": 1.5016928911209106, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8925817608833313, + "num_tokens": 631740845.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "grad_norm": 1.797520399093628, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.883885383605957, + "num_tokens": 631770490.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "grad_norm": 1.5146905183792114, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8886078596115112, + "num_tokens": 631811806.0, + "step": 16562 + }, + { + "epoch": 2.106983844294619, + "grad_norm": 1.6383576393127441, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8595612645149231, + "num_tokens": 631849625.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "grad_norm": 1.6636608839035034, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.881895899772644, + "num_tokens": 631884302.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + 
"grad_norm": 1.602684497833252, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8924819231033325, + "num_tokens": 631922676.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "grad_norm": 1.5219125747680664, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8894424438476562, + "num_tokens": 631961257.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "grad_norm": 1.8825266361236572, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8855450749397278, + "num_tokens": 631996218.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "grad_norm": 1.5917167663574219, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8866831660270691, + "num_tokens": 632033802.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "grad_norm": 1.5643163919448853, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.883583128452301, + "num_tokens": 632075666.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "grad_norm": 1.6153379678726196, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8843533992767334, + "num_tokens": 632114389.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "grad_norm": 1.5963218212127686, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8914890289306641, + "num_tokens": 632146182.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "grad_norm": 1.5102499723434448, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8891854286193848, + "num_tokens": 632187718.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "grad_norm": 1.5459308624267578, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8937971591949463, + "num_tokens": 632228697.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "grad_norm": 1.6730713844299316, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8805625438690186, + 
"num_tokens": 632264622.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "grad_norm": 1.5999451875686646, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.886966347694397, + "num_tokens": 632302621.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "grad_norm": 1.4442940950393677, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8917325735092163, + "num_tokens": 632344022.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "grad_norm": 1.5195671319961548, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8788188099861145, + "num_tokens": 632385274.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "grad_norm": 1.5609829425811768, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8874397873878479, + "num_tokens": 632422716.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "grad_norm": 1.649643898010254, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8674724698066711, + "num_tokens": 632461251.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "grad_norm": 1.6441630125045776, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8840814828872681, + "num_tokens": 632497367.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "grad_norm": 1.5308597087860107, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8926191926002502, + "num_tokens": 632532301.0, + "step": 16581 + }, + { + "epoch": 2.1094008395878387, + "grad_norm": 1.6673674583435059, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8694316744804382, + "num_tokens": 632568698.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "grad_norm": 1.5791127681732178, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8944892883300781, + "num_tokens": 632606049.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "grad_norm": 1.4960548877716064, + 
"learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8916610479354858, + "num_tokens": 632646974.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "grad_norm": 1.7969413995742798, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.878831148147583, + "num_tokens": 632678790.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "grad_norm": 1.72849702835083, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8817989826202393, + "num_tokens": 632711189.0, + "step": 16586 + }, + { + "epoch": 2.1100368909807914, + "grad_norm": 1.5026875734329224, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8850575685501099, + "num_tokens": 632752875.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "grad_norm": 1.6797934770584106, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8750441074371338, + "num_tokens": 632794189.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "grad_norm": 1.4256635904312134, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8948307633399963, + "num_tokens": 632833741.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "grad_norm": 1.631939172744751, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.875525712966919, + "num_tokens": 632872752.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "grad_norm": 1.7184550762176514, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8840978145599365, + "num_tokens": 632905810.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "grad_norm": 1.4537547826766968, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8912441730499268, + "num_tokens": 632949074.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "grad_norm": 1.4286339282989502, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.890724778175354, + "num_tokens": 632989879.0, + "step": 
16593 + }, + { + "epoch": 2.1109273629309246, + "grad_norm": 1.5572930574417114, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8917484283447266, + "num_tokens": 633025309.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "grad_norm": 1.5328749418258667, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8945125937461853, + "num_tokens": 633063598.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "grad_norm": 1.4217922687530518, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9053512811660767, + "num_tokens": 633104177.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "grad_norm": 1.507390022277832, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8824526071548462, + "num_tokens": 633145532.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "grad_norm": 1.5654962062835693, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8873645067214966, + "num_tokens": 633183484.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "grad_norm": 1.507706880569458, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8896081447601318, + "num_tokens": 633219917.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "grad_norm": 1.5996249914169312, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8892948031425476, + "num_tokens": 633251426.0, + "step": 16600 + }, + { + "epoch": 2.1118178348810583, + "grad_norm": 1.5363832712173462, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8910727500915527, + "num_tokens": 633288496.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "grad_norm": 1.6600244045257568, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8826521635055542, + "num_tokens": 633324537.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "grad_norm": 1.7106326818466187, + "learning_rate": 1e-06, + "loss": 0.3693, 
+ "mean_token_accuracy": 0.8716046810150146, + "num_tokens": 633361923.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "grad_norm": 1.5313209295272827, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8921459913253784, + "num_tokens": 633400366.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "grad_norm": 1.560031533241272, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8791462779045105, + "num_tokens": 633438879.0, + "step": 16605 + }, + { + "epoch": 2.112453886274011, + "grad_norm": 1.5344698429107666, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8876258730888367, + "num_tokens": 633476348.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "grad_norm": 1.6494239568710327, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8797130584716797, + "num_tokens": 633513558.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "grad_norm": 1.4340088367462158, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8803600072860718, + "num_tokens": 633559344.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "grad_norm": 1.5943961143493652, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8905583024024963, + "num_tokens": 633593798.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "grad_norm": 1.5323883295059204, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8939201831817627, + "num_tokens": 633628912.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "grad_norm": 1.5973844528198242, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8876063823699951, + "num_tokens": 633665925.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "grad_norm": 1.6391246318817139, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8727062344551086, + "num_tokens": 633702177.0, + "step": 16612 + }, + { + "epoch": 
2.1133443582241447, + "grad_norm": 1.6382561922073364, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8886295557022095, + "num_tokens": 633737143.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "grad_norm": 1.52481210231781, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8848366141319275, + "num_tokens": 633776366.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "grad_norm": 1.604059100151062, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8945721983909607, + "num_tokens": 633809486.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "grad_norm": 1.6665785312652588, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8917292356491089, + "num_tokens": 633842133.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "grad_norm": 1.6819223165512085, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8870351910591125, + "num_tokens": 633879459.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "grad_norm": 1.573630928993225, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8824765682220459, + "num_tokens": 633924382.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "grad_norm": 1.5481152534484863, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8853594064712524, + "num_tokens": 633964210.0, + "step": 16619 + }, + { + "epoch": 2.114234830174278, + "grad_norm": 1.6073696613311768, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8851126432418823, + "num_tokens": 634000388.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "grad_norm": 1.5988022089004517, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8862152099609375, + "num_tokens": 634038312.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "grad_norm": 1.5122345685958862, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 
0.8957599401473999, + "num_tokens": 634075429.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "grad_norm": 1.598123550415039, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8832787871360779, + "num_tokens": 634112522.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "grad_norm": 1.517387866973877, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8858290910720825, + "num_tokens": 634153352.0, + "step": 16624 + }, + { + "epoch": 2.1148708815672306, + "grad_norm": 1.582562804222107, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8889530301094055, + "num_tokens": 634188624.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "grad_norm": 1.457737922668457, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8804203271865845, + "num_tokens": 634227990.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "grad_norm": 1.5610053539276123, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8891686201095581, + "num_tokens": 634264750.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "grad_norm": 1.464447021484375, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.878853440284729, + "num_tokens": 634309428.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "grad_norm": 1.4855599403381348, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8954285383224487, + "num_tokens": 634347402.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "grad_norm": 1.5491106510162354, + "learning_rate": 1e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9006820917129517, + "num_tokens": 634379353.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "grad_norm": 1.4648869037628174, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8913383483886719, + "num_tokens": 634420682.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "grad_norm": 
1.6098519563674927, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8802791833877563, + "num_tokens": 634461334.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "grad_norm": 1.5000680685043335, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8826034069061279, + "num_tokens": 634502615.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "grad_norm": 1.5895394086837769, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8823177218437195, + "num_tokens": 634542788.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "grad_norm": 1.4330523014068604, + "learning_rate": 1e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9080783128738403, + "num_tokens": 634582847.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "grad_norm": 1.5263140201568604, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8862302303314209, + "num_tokens": 634622783.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "grad_norm": 1.6045863628387451, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8913505673408508, + "num_tokens": 634660217.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "grad_norm": 1.80585777759552, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8875906467437744, + "num_tokens": 634692722.0, + "step": 16638 + }, + { + "epoch": 2.116651825467498, + "grad_norm": 1.6489232778549194, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8657408356666565, + "num_tokens": 634732524.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "grad_norm": 1.622727394104004, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.871033251285553, + "num_tokens": 634774470.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "grad_norm": 1.528618574142456, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8891267776489258, + "num_tokens": 
634812562.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "grad_norm": 1.5479345321655273, + "learning_rate": 1e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.901985764503479, + "num_tokens": 634846507.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "grad_norm": 1.5872489213943481, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8771249055862427, + "num_tokens": 634884180.0, + "step": 16643 + }, + { + "epoch": 2.11728787686045, + "grad_norm": 1.668067455291748, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.874009370803833, + "num_tokens": 634922172.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "grad_norm": 1.5913537740707397, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8783058524131775, + "num_tokens": 634960310.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "grad_norm": 1.5967093706130981, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.878365159034729, + "num_tokens": 634996115.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "grad_norm": 1.4837936162948608, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8810991048812866, + "num_tokens": 635036892.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "grad_norm": 1.5931028127670288, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8816911578178406, + "num_tokens": 635076495.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "grad_norm": 1.453323483467102, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8897421956062317, + "num_tokens": 635115213.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "grad_norm": 1.6637227535247803, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8881458044052124, + "num_tokens": 635146348.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "grad_norm": 1.656951904296875, + "learning_rate": 1e-06, 
+ "loss": 0.3516, + "mean_token_accuracy": 0.8730838298797607, + "num_tokens": 635180742.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "grad_norm": 1.465323567390442, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8938018083572388, + "num_tokens": 635220762.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "grad_norm": 1.545767068862915, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8836410045623779, + "num_tokens": 635259048.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "grad_norm": 1.585626482963562, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8913028240203857, + "num_tokens": 635293667.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "grad_norm": 1.4017601013183594, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8906625509262085, + "num_tokens": 635339117.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "grad_norm": 1.44764244556427, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.893642783164978, + "num_tokens": 635379254.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "grad_norm": 1.565400242805481, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8900309801101685, + "num_tokens": 635416595.0, + "step": 16657 + }, + { + "epoch": 2.1190688207607176, + "grad_norm": 1.4831593036651611, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.895277738571167, + "num_tokens": 635453722.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "grad_norm": 1.6089292764663696, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8786197304725647, + "num_tokens": 635492695.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "grad_norm": 1.5397627353668213, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8856070041656494, + "num_tokens": 635532152.0, + "step": 16660 + }, + { + "epoch": 
2.119450451596489, + "grad_norm": 1.5291582345962524, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8729958534240723, + "num_tokens": 635575181.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "grad_norm": 1.482977271080017, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8911863565444946, + "num_tokens": 635618042.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "grad_norm": 1.560789942741394, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8907077312469482, + "num_tokens": 635653623.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "grad_norm": 1.5945708751678467, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8820090889930725, + "num_tokens": 635690962.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "grad_norm": 1.580399751663208, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8849745392799377, + "num_tokens": 635728695.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "grad_norm": 1.7105095386505127, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.883988618850708, + "num_tokens": 635762148.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "grad_norm": 1.6596759557724, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8808661699295044, + "num_tokens": 635797542.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "grad_norm": 1.6000075340270996, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8865405321121216, + "num_tokens": 635833736.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "grad_norm": 1.5987460613250732, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.878757894039154, + "num_tokens": 635873378.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "grad_norm": 1.5635803937911987, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 
0.8816986680030823, + "num_tokens": 635913422.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "grad_norm": 1.741720199584961, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8789029121398926, + "num_tokens": 635946840.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "grad_norm": 1.604602336883545, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8888441324234009, + "num_tokens": 635988814.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "grad_norm": 1.5217254161834717, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8915111422538757, + "num_tokens": 636031524.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "grad_norm": 1.6330403089523315, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.889858603477478, + "num_tokens": 636067107.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "grad_norm": 1.6661127805709839, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8850955963134766, + "num_tokens": 636103138.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "grad_norm": 1.490830898284912, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.9000770449638367, + "num_tokens": 636143191.0, + "step": 16676 + }, + { + "epoch": 2.121485816053937, + "grad_norm": 1.5469970703125, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.888945460319519, + "num_tokens": 636181894.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "grad_norm": 1.43704092502594, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.879743218421936, + "num_tokens": 636227431.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "grad_norm": 1.529542088508606, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8825955390930176, + "num_tokens": 636266227.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "grad_norm": 
1.4867711067199707, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9007903933525085, + "num_tokens": 636305831.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "grad_norm": 1.4406720399856567, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8950791954994202, + "num_tokens": 636342982.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "grad_norm": 1.4529749155044556, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.892920732498169, + "num_tokens": 636384563.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "grad_norm": 1.6465438604354858, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.890027642250061, + "num_tokens": 636419837.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "grad_norm": 1.5754023790359497, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8876864910125732, + "num_tokens": 636455431.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "grad_norm": 1.6843335628509521, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8817762732505798, + "num_tokens": 636494345.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "grad_norm": 1.5389409065246582, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8795965909957886, + "num_tokens": 636536822.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "grad_norm": 1.464249849319458, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8878984451293945, + "num_tokens": 636575982.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "grad_norm": 1.616952657699585, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8623122572898865, + "num_tokens": 636616645.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "grad_norm": 1.647030234336853, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8894094228744507, + "num_tokens": 
636654698.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "grad_norm": 1.5694011449813843, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8899493217468262, + "num_tokens": 636690530.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "grad_norm": 1.4793736934661865, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8822212815284729, + "num_tokens": 636733466.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "grad_norm": 1.504198670387268, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8898639678955078, + "num_tokens": 636772015.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "grad_norm": 1.6307939291000366, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8900843858718872, + "num_tokens": 636805579.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "grad_norm": 1.4618691205978394, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8954447507858276, + "num_tokens": 636848342.0, + "step": 16694 + }, + { + "epoch": 2.123775601068566, + "grad_norm": 1.6761562824249268, + "learning_rate": 1e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9010347723960876, + "num_tokens": 636879806.0, + "step": 16695 + }, + { + "epoch": 2.1239028113471567, + "grad_norm": 1.5400824546813965, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.876543402671814, + "num_tokens": 636919364.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "grad_norm": 1.4914571046829224, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8928057551383972, + "num_tokens": 636958320.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "grad_norm": 1.5474964380264282, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.88881516456604, + "num_tokens": 636995318.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "grad_norm": 1.485753059387207, + "learning_rate": 
1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8929561972618103, + "num_tokens": 637036852.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "grad_norm": 1.4744371175765991, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8911035656929016, + "num_tokens": 637079472.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "grad_norm": 1.4649287462234497, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.880517840385437, + "num_tokens": 637124372.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "grad_norm": 1.6102443933486938, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8955153226852417, + "num_tokens": 637159979.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "grad_norm": 1.4786046743392944, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.8981180191040039, + "num_tokens": 637200861.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "grad_norm": 1.42690908908844, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8873817920684814, + "num_tokens": 637245386.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "grad_norm": 1.6383388042449951, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8790633678436279, + "num_tokens": 637286544.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "grad_norm": 1.657968282699585, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8623172640800476, + "num_tokens": 637323650.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "grad_norm": 1.4687260389328003, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8891993761062622, + "num_tokens": 637361082.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "grad_norm": 1.4731899499893188, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.8998700976371765, + "num_tokens": 637399341.0, + "step": 16708 + }, + { + 
"epoch": 2.1255565449688336, + "grad_norm": 1.533851981163025, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8933846950531006, + "num_tokens": 637437623.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "grad_norm": 1.5305989980697632, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8879116773605347, + "num_tokens": 637478128.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "grad_norm": 1.4822001457214355, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8929286003112793, + "num_tokens": 637517167.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "grad_norm": 1.5171245336532593, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8892221450805664, + "num_tokens": 637556051.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "grad_norm": 1.5009229183197021, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8848783373832703, + "num_tokens": 637596990.0, + "step": 16713 + }, + { + "epoch": 2.1261925963617863, + "grad_norm": 1.5494462251663208, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8850947022438049, + "num_tokens": 637638825.0, + "step": 16714 + }, + { + "epoch": 2.1263198066403763, + "grad_norm": 1.4391121864318848, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.9006592631340027, + "num_tokens": 637679559.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "grad_norm": 1.689591407775879, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8825830221176147, + "num_tokens": 637715825.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "grad_norm": 1.4856387376785278, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8928701877593994, + "num_tokens": 637758413.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "grad_norm": 1.6850504875183105, + "learning_rate": 1e-06, + "loss": 0.3039, + 
"mean_token_accuracy": 0.8917663097381592, + "num_tokens": 637789703.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "grad_norm": 1.68655526638031, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8794488310813904, + "num_tokens": 637823166.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "grad_norm": 1.6362574100494385, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.883525013923645, + "num_tokens": 637861994.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "grad_norm": 1.5668450593948364, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8916257619857788, + "num_tokens": 637898852.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "grad_norm": 1.6506255865097046, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8886383771896362, + "num_tokens": 637935067.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "grad_norm": 1.7679846286773682, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8826756477355957, + "num_tokens": 637969019.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "grad_norm": 1.4767407178878784, + "learning_rate": 1e-06, + "loss": 0.2565, + "mean_token_accuracy": 0.9066833257675171, + "num_tokens": 638005466.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "grad_norm": 1.6210304498672485, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8981072306632996, + "num_tokens": 638044383.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "grad_norm": 1.7098345756530762, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8836173415184021, + "num_tokens": 638078259.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "grad_norm": 1.727226734161377, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8923999071121216, + "num_tokens": 638112079.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + 
"grad_norm": 1.4378618001937866, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8955686092376709, + "num_tokens": 638153023.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "grad_norm": 1.5697410106658936, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8931989669799805, + "num_tokens": 638187685.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "grad_norm": 1.6196701526641846, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8814507722854614, + "num_tokens": 638225629.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "grad_norm": 1.6292799711227417, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8801623582839966, + "num_tokens": 638264325.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "grad_norm": 1.7136030197143555, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8629767894744873, + "num_tokens": 638304009.0, + "step": 16732 + }, + { + "epoch": 2.128609591655006, + "grad_norm": 1.5542923212051392, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8930922150611877, + "num_tokens": 638343359.0, + "step": 16733 + }, + { + "epoch": 2.1287368019335964, + "grad_norm": 1.6127488613128662, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8772692680358887, + "num_tokens": 638379537.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "grad_norm": 1.6122612953186035, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8681290149688721, + "num_tokens": 638419324.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "grad_norm": 1.5552551746368408, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8867167234420776, + "num_tokens": 638455567.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "grad_norm": 1.6343015432357788, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8894463181495667, + 
"num_tokens": 638490410.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "grad_norm": 1.5196170806884766, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8845921754837036, + "num_tokens": 638530597.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "grad_norm": 1.4823482036590576, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.888253927230835, + "num_tokens": 638574040.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "grad_norm": 1.446763515472412, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.898766279220581, + "num_tokens": 638612227.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "grad_norm": 1.6259617805480957, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8830345869064331, + "num_tokens": 638649885.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "grad_norm": 1.435894250869751, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8961559534072876, + "num_tokens": 638686928.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "grad_norm": 1.4967387914657593, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8846473097801208, + "num_tokens": 638726327.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "grad_norm": 1.4921716451644897, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8949582576751709, + "num_tokens": 638767662.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "grad_norm": 1.6467734575271606, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8782724738121033, + "num_tokens": 638808981.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "grad_norm": 1.6359580755233765, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8714873790740967, + "num_tokens": 638847708.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "grad_norm": 1.5720652341842651, + 
"learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.893477201461792, + "num_tokens": 638884417.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "grad_norm": 1.5751533508300781, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8922827243804932, + "num_tokens": 638920075.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "grad_norm": 1.4322702884674072, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8972028493881226, + "num_tokens": 638959303.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "grad_norm": 1.4844495058059692, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8837546706199646, + "num_tokens": 639000093.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "grad_norm": 1.5659202337265015, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8848715424537659, + "num_tokens": 639037799.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "grad_norm": 1.6926647424697876, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8828278183937073, + "num_tokens": 639072019.0, + "step": 16752 + }, + { + "epoch": 2.131153797226816, + "grad_norm": 1.5406227111816406, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8843942880630493, + "num_tokens": 639111067.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "grad_norm": 1.5309919118881226, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9001266956329346, + "num_tokens": 639149363.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "grad_norm": 1.6492674350738525, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.891183078289032, + "num_tokens": 639184259.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "grad_norm": 1.568771243095398, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8872823715209961, + "num_tokens": 639226589.0, + "step": 16756 
+ }, + { + "epoch": 2.131662638341178, + "grad_norm": 1.629789113998413, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8802509307861328, + "num_tokens": 639264539.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "grad_norm": 1.651311993598938, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8866333961486816, + "num_tokens": 639299172.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "grad_norm": 1.4867300987243652, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8756647706031799, + "num_tokens": 639340401.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "grad_norm": 1.3712694644927979, + "learning_rate": 1e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.8999125957489014, + "num_tokens": 639384348.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "grad_norm": 1.6736555099487305, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8714290261268616, + "num_tokens": 639423198.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "grad_norm": 1.7843643426895142, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8851141929626465, + "num_tokens": 639454424.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "grad_norm": 1.6571730375289917, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8789622783660889, + "num_tokens": 639492487.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "grad_norm": 1.6947089433670044, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8870556354522705, + "num_tokens": 639526019.0, + "step": 16764 + }, + { + "epoch": 2.132680320569902, + "grad_norm": 1.733000636100769, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.890882134437561, + "num_tokens": 639561027.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "grad_norm": 1.5432908535003662, + "learning_rate": 1e-06, + "loss": 0.2929, + 
"mean_token_accuracy": 0.8952585458755493, + "num_tokens": 639598174.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "grad_norm": 1.5430344343185425, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8901904225349426, + "num_tokens": 639638785.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "grad_norm": 1.4993526935577393, + "learning_rate": 1e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.8997572064399719, + "num_tokens": 639676291.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "grad_norm": 1.5320963859558105, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8965644836425781, + "num_tokens": 639713911.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "grad_norm": 1.4073928594589233, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8874785304069519, + "num_tokens": 639757974.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "grad_norm": 1.6187458038330078, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8913781642913818, + "num_tokens": 639796217.0, + "step": 16771 + }, + { + "epoch": 2.1335707925200356, + "grad_norm": 1.598934531211853, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8875957727432251, + "num_tokens": 639831458.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "grad_norm": 1.5295933485031128, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8953016996383667, + "num_tokens": 639867199.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "grad_norm": 1.5171340703964233, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8676848411560059, + "num_tokens": 639913418.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "grad_norm": 1.6312569379806519, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8744767308235168, + "num_tokens": 639956567.0, + "step": 16775 + }, + { + "epoch": 
2.1340796336343977, + "grad_norm": 1.5021873712539673, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8675563931465149, + "num_tokens": 640000741.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "grad_norm": 1.6089773178100586, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8748009204864502, + "num_tokens": 640040108.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "grad_norm": 1.599786400794983, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8737168312072754, + "num_tokens": 640080717.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "grad_norm": 1.5466564893722534, + "learning_rate": 1e-06, + "loss": 0.2531, + "mean_token_accuracy": 0.9082308411598206, + "num_tokens": 640114013.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "grad_norm": 1.6269298791885376, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8836236596107483, + "num_tokens": 640151050.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "grad_norm": 1.5233683586120605, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8794468641281128, + "num_tokens": 640191004.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "grad_norm": 1.5913879871368408, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8810991048812866, + "num_tokens": 640228748.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "grad_norm": 1.7201614379882812, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8806813955307007, + "num_tokens": 640264331.0, + "step": 16783 + }, + { + "epoch": 2.135097315863122, + "grad_norm": 1.5289474725723267, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8843746185302734, + "num_tokens": 640306149.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "grad_norm": 1.5900349617004395, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 
0.8787716031074524, + "num_tokens": 640349004.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "grad_norm": 1.7223846912384033, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8736125826835632, + "num_tokens": 640385630.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "grad_norm": 1.6939237117767334, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8879663944244385, + "num_tokens": 640417622.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "grad_norm": 1.4511915445327759, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8778074979782104, + "num_tokens": 640460557.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "grad_norm": 1.4921735525131226, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.877037525177002, + "num_tokens": 640501744.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "grad_norm": 1.6284024715423584, + "learning_rate": 1e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.8982952833175659, + "num_tokens": 640541119.0, + "step": 16790 + }, + { + "epoch": 2.135987787813255, + "grad_norm": 1.6374082565307617, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8970005512237549, + "num_tokens": 640579206.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "grad_norm": 1.4436665773391724, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.884711503982544, + "num_tokens": 640625919.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "grad_norm": 1.5890010595321655, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8878099918365479, + "num_tokens": 640665926.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "grad_norm": 1.5627851486206055, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8875537514686584, + "num_tokens": 640706637.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "grad_norm": 
1.6297845840454102, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.886982262134552, + "num_tokens": 640746523.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "grad_norm": 1.4788931608200073, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.882853627204895, + "num_tokens": 640792166.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "grad_norm": 1.6628679037094116, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8871020674705505, + "num_tokens": 640827633.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "grad_norm": 1.6606495380401611, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8842359185218811, + "num_tokens": 640865727.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "grad_norm": 1.7309391498565674, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8784327507019043, + "num_tokens": 640900678.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "grad_norm": 1.4599792957305908, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8878900408744812, + "num_tokens": 640944540.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "grad_norm": 1.4820078611373901, + "learning_rate": 1e-06, + "loss": 0.2595, + "mean_token_accuracy": 0.9069607257843018, + "num_tokens": 640985316.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "grad_norm": 1.6164344549179077, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8958755731582642, + "num_tokens": 641027706.0, + "step": 16802 + }, + { + "epoch": 2.1375143111563415, + "grad_norm": 1.5748850107192993, + "learning_rate": 1e-06, + "loss": 0.2691, + "mean_token_accuracy": 0.9040831923484802, + "num_tokens": 641061902.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "grad_norm": 1.5613031387329102, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8890120983123779, + "num_tokens": 
641103497.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "grad_norm": 1.8075649738311768, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8738263845443726, + "num_tokens": 641136809.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "grad_norm": 1.5412670373916626, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8913671970367432, + "num_tokens": 641175724.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "grad_norm": 1.6200101375579834, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.881301999092102, + "num_tokens": 641210098.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "grad_norm": 1.5969003438949585, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8901881575584412, + "num_tokens": 641243889.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "grad_norm": 1.5566493272781372, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8934146761894226, + "num_tokens": 641279838.0, + "step": 16809 + }, + { + "epoch": 2.138404783106475, + "grad_norm": 1.4543039798736572, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.894773006439209, + "num_tokens": 641319375.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "grad_norm": 1.605373501777649, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8867825865745544, + "num_tokens": 641353785.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "grad_norm": 1.6416879892349243, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8853036761283875, + "num_tokens": 641388081.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "grad_norm": 1.551710605621338, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8751633763313293, + "num_tokens": 641428363.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "grad_norm": 1.7638437747955322, + "learning_rate": 
1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8769134283065796, + "num_tokens": 641460545.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "grad_norm": 1.6105471849441528, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8765485286712646, + "num_tokens": 641499651.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "grad_norm": 1.4442616701126099, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.898252546787262, + "num_tokens": 641539862.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "grad_norm": 1.643043875694275, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8868395090103149, + "num_tokens": 641575524.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "grad_norm": 1.4946174621582031, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8819670677185059, + "num_tokens": 641621340.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "grad_norm": 1.6169955730438232, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8815338015556335, + "num_tokens": 641657233.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "grad_norm": 1.5499708652496338, + "learning_rate": 1e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.9019805192947388, + "num_tokens": 641693740.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "grad_norm": 1.6457470655441284, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8860656023025513, + "num_tokens": 641731798.0, + "step": 16821 + }, + { + "epoch": 2.139931306449561, + "grad_norm": 1.6464078426361084, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8958368301391602, + "num_tokens": 641762911.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "grad_norm": 1.6030689477920532, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8806952238082886, + "num_tokens": 641802387.0, + "step": 16823 + }, + { + 
"epoch": 2.140185727006742, + "grad_norm": 1.6237716674804688, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8600225448608398, + "num_tokens": 641846133.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "grad_norm": 1.6722677946090698, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8782917261123657, + "num_tokens": 641881730.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "grad_norm": 1.5295448303222656, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8890254497528076, + "num_tokens": 641922261.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "grad_norm": 1.5257060527801514, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8905283212661743, + "num_tokens": 641961297.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "grad_norm": 1.5963280200958252, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.893379271030426, + "num_tokens": 641999950.0, + "step": 16828 + }, + { + "epoch": 2.140821778399695, + "grad_norm": 1.4175275564193726, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8975704908370972, + "num_tokens": 642041255.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "grad_norm": 1.5639841556549072, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8877268433570862, + "num_tokens": 642078730.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "grad_norm": 1.5423905849456787, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8963425159454346, + "num_tokens": 642116062.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "grad_norm": 1.716068983078003, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8670428395271301, + "num_tokens": 642156642.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "grad_norm": 1.6575634479522705, + "learning_rate": 1e-06, + "loss": 0.2661, + 
"mean_token_accuracy": 0.904121994972229, + "num_tokens": 642187895.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "grad_norm": 1.5404813289642334, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.8950634002685547, + "num_tokens": 642224162.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "grad_norm": 1.5539581775665283, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8812770843505859, + "num_tokens": 642266474.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "grad_norm": 1.5826008319854736, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.888480544090271, + "num_tokens": 642301416.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "grad_norm": 1.6925759315490723, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8861244916915894, + "num_tokens": 642336235.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "grad_norm": 1.467942237854004, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8940147757530212, + "num_tokens": 642377283.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "grad_norm": 1.5917109251022339, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8737670183181763, + "num_tokens": 642418336.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "grad_norm": 1.6642643213272095, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8764898180961609, + "num_tokens": 642454480.0, + "step": 16840 + }, + { + "epoch": 2.1423483017427807, + "grad_norm": 1.5488340854644775, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8912662267684937, + "num_tokens": 642493677.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "grad_norm": 1.5683231353759766, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8775710463523865, + "num_tokens": 642535570.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, 
+ "grad_norm": 1.5818321704864502, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8769077062606812, + "num_tokens": 642574941.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "grad_norm": 1.6590299606323242, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8775641918182373, + "num_tokens": 642610069.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 1.5262449979782104, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.889554500579834, + "num_tokens": 642649156.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "grad_norm": 1.583724021911621, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8889875411987305, + "num_tokens": 642684643.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "grad_norm": 1.5018565654754639, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.889832615852356, + "num_tokens": 642726955.0, + "step": 16847 + }, + { + "epoch": 2.1432387736929144, + "grad_norm": 1.5103819370269775, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8988542556762695, + "num_tokens": 642763573.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "grad_norm": 1.6047141551971436, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8934218883514404, + "num_tokens": 642798909.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "grad_norm": 1.6618118286132812, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8829361200332642, + "num_tokens": 642831835.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "grad_norm": 1.6293370723724365, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8942487835884094, + "num_tokens": 642868207.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "grad_norm": 1.628753900527954, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8723940849304199, + 
"num_tokens": 642907689.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "grad_norm": 1.435148000717163, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8843718767166138, + "num_tokens": 642952275.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "grad_norm": 1.79950749874115, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8895896077156067, + "num_tokens": 642983942.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "grad_norm": 1.4217253923416138, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8988816738128662, + "num_tokens": 643024912.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "grad_norm": 1.5705549716949463, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8976471424102783, + "num_tokens": 643059698.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "grad_norm": 1.5980701446533203, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8887330293655396, + "num_tokens": 643096068.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "grad_norm": 1.5634791851043701, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8807458281517029, + "num_tokens": 643132695.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "grad_norm": 1.720008134841919, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8636220693588257, + "num_tokens": 643171464.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "grad_norm": 1.4672424793243408, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8923896551132202, + "num_tokens": 643213740.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "grad_norm": 1.5651708841323853, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8837839365005493, + "num_tokens": 643250854.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "grad_norm": 1.5267888307571411, + 
"learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8938432931900024, + "num_tokens": 643290215.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "grad_norm": 1.42995285987854, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8911662101745605, + "num_tokens": 643330209.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "grad_norm": 1.4847731590270996, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8790796399116516, + "num_tokens": 643369432.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "grad_norm": 1.4992934465408325, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8868544697761536, + "num_tokens": 643410468.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "grad_norm": 1.563849925994873, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8875308036804199, + "num_tokens": 643449450.0, + "step": 16866 + }, + { + "epoch": 2.145655768986134, + "grad_norm": 1.6943395137786865, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8799687027931213, + "num_tokens": 643485897.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "grad_norm": 1.72073233127594, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8877967000007629, + "num_tokens": 643521284.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "grad_norm": 1.617223858833313, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.872837483882904, + "num_tokens": 643558270.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "grad_norm": 1.6093741655349731, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8723233938217163, + "num_tokens": 643596225.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "grad_norm": 1.435099482536316, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8935455083847046, + "num_tokens": 643639394.0, + "step": 16871 
+ }, + { + "epoch": 2.1462918203790866, + "grad_norm": 1.6955006122589111, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8782837390899658, + "num_tokens": 643671745.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "grad_norm": 1.5265592336654663, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8779442310333252, + "num_tokens": 643710085.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "grad_norm": 1.6147079467773438, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8870288729667664, + "num_tokens": 643747866.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "grad_norm": 1.5462819337844849, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8757522702217102, + "num_tokens": 643787349.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "grad_norm": 1.542497992515564, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8947159051895142, + "num_tokens": 643823136.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "grad_norm": 1.603546380996704, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8925881385803223, + "num_tokens": 643858397.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "grad_norm": 1.5148561000823975, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8765685558319092, + "num_tokens": 643899917.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "grad_norm": 1.5742496252059937, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8832824230194092, + "num_tokens": 643936898.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "grad_norm": 1.461253046989441, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8915870189666748, + "num_tokens": 643978360.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "grad_norm": 1.5778536796569824, + "learning_rate": 1e-06, + "loss": 0.3251, + 
"mean_token_accuracy": 0.8825200796127319, + "num_tokens": 644012590.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "grad_norm": 1.5729808807373047, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8724967241287231, + "num_tokens": 644054581.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "grad_norm": 1.60739266872406, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.8977360725402832, + "num_tokens": 644089744.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "grad_norm": 1.485154628753662, + "learning_rate": 1e-06, + "loss": 0.2839, + "mean_token_accuracy": 0.8968952894210815, + "num_tokens": 644130520.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "grad_norm": 1.7350839376449585, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8812952041625977, + "num_tokens": 644165440.0, + "step": 16885 + }, + { + "epoch": 2.1480727642793536, + "grad_norm": 1.594993233680725, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8791475296020508, + "num_tokens": 644203620.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "grad_norm": 1.714531660079956, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8856358528137207, + "num_tokens": 644237466.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "grad_norm": 1.5597889423370361, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8815265893936157, + "num_tokens": 644278790.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "grad_norm": 1.6134907007217407, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8922252655029297, + "num_tokens": 644316504.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "grad_norm": 1.5497028827667236, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8907316327095032, + "num_tokens": 644357513.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, 
+ "grad_norm": 1.69901442527771, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8934195637702942, + "num_tokens": 644391578.0, + "step": 16891 + }, + { + "epoch": 2.1488360259508967, + "grad_norm": 1.5189157724380493, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.8981736898422241, + "num_tokens": 644429015.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "grad_norm": 1.4782602787017822, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8905351758003235, + "num_tokens": 644468688.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "grad_norm": 1.5644092559814453, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.872146725654602, + "num_tokens": 644510468.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "grad_norm": 1.3403546810150146, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8990193605422974, + "num_tokens": 644558085.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "grad_norm": 1.8283913135528564, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8702295422554016, + "num_tokens": 644594409.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "grad_norm": 1.5585345029830933, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8860828876495361, + "num_tokens": 644635329.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "grad_norm": 1.5067212581634521, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8843812942504883, + "num_tokens": 644677288.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "grad_norm": 1.4562889337539673, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8806456327438354, + "num_tokens": 644721824.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "grad_norm": 1.5452044010162354, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8726773262023926, + 
"num_tokens": 644763633.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "grad_norm": 1.4779009819030762, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8835505247116089, + "num_tokens": 644803894.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "grad_norm": 1.628661036491394, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8818687200546265, + "num_tokens": 644843449.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "grad_norm": 1.6231223344802856, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8902400135993958, + "num_tokens": 644878047.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "grad_norm": 1.6298364400863647, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8801588416099548, + "num_tokens": 644915981.0, + "step": 16904 + }, + { + "epoch": 2.1504897595725736, + "grad_norm": 1.6684510707855225, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8867799043655396, + "num_tokens": 644948890.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "grad_norm": 1.5912489891052246, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8897702693939209, + "num_tokens": 644983396.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "grad_norm": 1.660447359085083, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8905196189880371, + "num_tokens": 645016125.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "grad_norm": 1.6192119121551514, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8872376084327698, + "num_tokens": 645050622.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "grad_norm": 1.7521942853927612, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8827698230743408, + "num_tokens": 645081577.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "grad_norm": 1.5524102449417114, + 
"learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8924440145492554, + "num_tokens": 645118896.0, + "step": 16910 + }, + { + "epoch": 2.1512530212441163, + "grad_norm": 1.451579213142395, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.889559805393219, + "num_tokens": 645160613.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "grad_norm": 1.4516559839248657, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8943378925323486, + "num_tokens": 645202111.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "grad_norm": 1.5650887489318848, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8899956941604614, + "num_tokens": 645242840.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "grad_norm": 1.5581895112991333, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8900165557861328, + "num_tokens": 645282917.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "grad_norm": 1.566424012184143, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8858251571655273, + "num_tokens": 645321722.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "grad_norm": 1.4692810773849487, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.891264796257019, + "num_tokens": 645360366.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "grad_norm": 1.4774420261383057, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8890636563301086, + "num_tokens": 645401248.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "grad_norm": 1.581726312637329, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.878007173538208, + "num_tokens": 645441750.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "grad_norm": 1.6449121236801147, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8623511791229248, + "num_tokens": 645481044.0, + "step": 
16919 + }, + { + "epoch": 2.152397913751431, + "grad_norm": 1.8474292755126953, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8785266876220703, + "num_tokens": 645511358.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "grad_norm": 1.8590890169143677, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8697161674499512, + "num_tokens": 645543801.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "grad_norm": 1.6210744380950928, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8779890537261963, + "num_tokens": 645580796.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "grad_norm": 1.510070562362671, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8945895433425903, + "num_tokens": 645616933.0, + "step": 16923 + }, + { + "epoch": 2.152906754865793, + "grad_norm": 1.6322168111801147, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8733855485916138, + "num_tokens": 645651932.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "grad_norm": 1.4814751148223877, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8837515115737915, + "num_tokens": 645692437.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "grad_norm": 1.5145784616470337, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8854376673698425, + "num_tokens": 645731852.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "grad_norm": 1.6796014308929443, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8859161138534546, + "num_tokens": 645765207.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "grad_norm": 1.4802144765853882, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.890972375869751, + "num_tokens": 645805236.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "grad_norm": 1.531390905380249, + "learning_rate": 1e-06, + "loss": 0.3453, 
+ "mean_token_accuracy": 0.8786578178405762, + "num_tokens": 645847128.0, + "step": 16929 + }, + { + "epoch": 2.1536700165373364, + "grad_norm": 1.6607325077056885, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8780001401901245, + "num_tokens": 645884067.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "grad_norm": 1.4726791381835938, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.893408477306366, + "num_tokens": 645925533.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "grad_norm": 1.4661425352096558, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8948972225189209, + "num_tokens": 645967318.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "grad_norm": 1.636350393295288, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8838263750076294, + "num_tokens": 646000460.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "grad_norm": 1.5053915977478027, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8781823515892029, + "num_tokens": 646043689.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "grad_norm": 1.5823684930801392, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8797908425331116, + "num_tokens": 646083237.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "grad_norm": 1.835719347000122, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8648008108139038, + "num_tokens": 646115576.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "grad_norm": 1.4743691682815552, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8811100721359253, + "num_tokens": 646157958.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "grad_norm": 1.5257152318954468, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8799198269844055, + "num_tokens": 646200171.0, + "step": 16938 + }, + { + "epoch": 
2.1548149090446507, + "grad_norm": 1.4854117631912231, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8864115476608276, + "num_tokens": 646243880.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "grad_norm": 1.6178244352340698, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8819663524627686, + "num_tokens": 646279339.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "grad_norm": 1.547784686088562, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8857976794242859, + "num_tokens": 646316773.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "grad_norm": 1.5669726133346558, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8879553079605103, + "num_tokens": 646356468.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "grad_norm": 1.6049492359161377, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8860878944396973, + "num_tokens": 646394858.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "grad_norm": 1.4444999694824219, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.8931831121444702, + "num_tokens": 646433513.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "grad_norm": 1.4900425672531128, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8937543630599976, + "num_tokens": 646474645.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "grad_norm": 1.6988621950149536, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8915712833404541, + "num_tokens": 646505953.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "grad_norm": 1.4512735605239868, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8944453001022339, + "num_tokens": 646544187.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "grad_norm": 1.6622543334960938, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 
0.8941532969474792, + "num_tokens": 646576774.0, + "step": 16948 + }, + { + "epoch": 2.156087011830556, + "grad_norm": 1.6760361194610596, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8825207948684692, + "num_tokens": 646610525.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "grad_norm": 1.6344873905181885, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8836864829063416, + "num_tokens": 646644677.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "grad_norm": 1.4927363395690918, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8932490944862366, + "num_tokens": 646683525.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "grad_norm": 1.668244481086731, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8873793482780457, + "num_tokens": 646717656.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "grad_norm": 1.596553921699524, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8900236487388611, + "num_tokens": 646754223.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "grad_norm": 1.8111580610275269, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8926093578338623, + "num_tokens": 646787523.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "grad_norm": 1.431769609451294, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8930801749229431, + "num_tokens": 646829525.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "grad_norm": 1.612318992614746, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8921591639518738, + "num_tokens": 646864831.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "grad_norm": 1.6032215356826782, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8869749307632446, + "num_tokens": 646903983.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "grad_norm": 
1.58805251121521, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8919771313667297, + "num_tokens": 646941366.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "grad_norm": 1.5393867492675781, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8866081237792969, + "num_tokens": 646983976.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "grad_norm": 2.011178493499756, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8737908601760864, + "num_tokens": 647012858.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "grad_norm": 1.5007433891296387, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8793743848800659, + "num_tokens": 647052953.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "grad_norm": 1.4570003747940063, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8951143026351929, + "num_tokens": 647098206.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "grad_norm": 1.593734622001648, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.897558331489563, + "num_tokens": 647133242.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "grad_norm": 1.5732483863830566, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8867924213409424, + "num_tokens": 647173741.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "grad_norm": 1.7020928859710693, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8862696290016174, + "num_tokens": 647208210.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "grad_norm": 1.5559983253479004, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8875473141670227, + "num_tokens": 647250903.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "grad_norm": 1.585130214691162, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8767327666282654, + "num_tokens": 
647288425.0, + "step": 16967 + }, + { + "epoch": 2.1585040071237755, + "grad_norm": 1.5232495069503784, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8757985830307007, + "num_tokens": 647332550.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "grad_norm": 1.615860104560852, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8918712139129639, + "num_tokens": 647367449.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "grad_norm": 1.606429100036621, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8840377330780029, + "num_tokens": 647406049.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "grad_norm": 1.5261685848236084, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8856570720672607, + "num_tokens": 647446011.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "grad_norm": 1.636961579322815, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8891029357910156, + "num_tokens": 647480260.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "grad_norm": 1.5518975257873535, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8759312629699707, + "num_tokens": 647521318.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "grad_norm": 1.527254343032837, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8763310313224792, + "num_tokens": 647560417.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "grad_norm": 1.3836426734924316, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8889783620834351, + "num_tokens": 647607083.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "grad_norm": 1.8935080766677856, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8840753436088562, + "num_tokens": 647636906.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "grad_norm": 1.4771808385849, + "learning_rate": 1e-06, 
+ "loss": 0.2888, + "mean_token_accuracy": 0.8944919109344482, + "num_tokens": 647677435.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "grad_norm": 1.5637445449829102, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8801766633987427, + "num_tokens": 647715539.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "grad_norm": 1.80348801612854, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.868111789226532, + "num_tokens": 647749070.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "grad_norm": 1.4860236644744873, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8969408273696899, + "num_tokens": 647788323.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "grad_norm": 1.6888396739959717, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8925726413726807, + "num_tokens": 647819336.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "grad_norm": 1.6235897541046143, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8868182897567749, + "num_tokens": 647857801.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "grad_norm": 1.710367202758789, + "learning_rate": 1e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.9035463929176331, + "num_tokens": 647892055.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "grad_norm": 1.6162608861923218, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8773870468139648, + "num_tokens": 647931417.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "grad_norm": 1.6682634353637695, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.892334520816803, + "num_tokens": 647964677.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "grad_norm": 1.9556950330734253, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8876363635063171, + "num_tokens": 647999338.0, + "step": 16986 + }, + { + "epoch": 
2.160921002416995, + "grad_norm": 1.6302825212478638, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8735597133636475, + "num_tokens": 648040944.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "grad_norm": 1.5868539810180664, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8815127015113831, + "num_tokens": 648078983.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "grad_norm": 1.4760379791259766, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8706245422363281, + "num_tokens": 648123165.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "grad_norm": 1.595450520515442, + "learning_rate": 1e-06, + "loss": 0.267, + "mean_token_accuracy": 0.9042778015136719, + "num_tokens": 648157466.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "grad_norm": 1.4198718070983887, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8909414410591125, + "num_tokens": 648200899.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "grad_norm": 1.5278483629226685, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8778760433197021, + "num_tokens": 648240267.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "grad_norm": 1.6010650396347046, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8918174505233765, + "num_tokens": 648275837.0, + "step": 16993 + }, + { + "epoch": 2.161811474367129, + "grad_norm": 1.674088716506958, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8802287578582764, + "num_tokens": 648311170.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "grad_norm": 1.5341182947158813, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8813536763191223, + "num_tokens": 648350045.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "grad_norm": 1.5231705904006958, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 
0.8636964559555054, + "num_tokens": 648391297.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "grad_norm": 1.4173675775527954, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8833139538764954, + "num_tokens": 648434167.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "grad_norm": 1.7475303411483765, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8796682357788086, + "num_tokens": 648467693.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "grad_norm": 1.5736178159713745, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8899956345558167, + "num_tokens": 648508538.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "grad_norm": 1.5884418487548828, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8952115774154663, + "num_tokens": 648545024.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "grad_norm": 1.6234180927276611, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8875911235809326, + "num_tokens": 648584865.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "grad_norm": 1.4645140171051025, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8948192596435547, + "num_tokens": 648625832.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "grad_norm": 1.5791561603546143, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.886158287525177, + "num_tokens": 648661182.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "grad_norm": 1.816296935081482, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8851107358932495, + "num_tokens": 648694748.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "grad_norm": 1.63667893409729, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8782203197479248, + "num_tokens": 648733219.0, + "step": 17005 + }, + { + "epoch": 2.163337997710215, + "grad_norm": 
1.4752607345581055, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8830175399780273, + "num_tokens": 648774955.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "grad_norm": 1.4960508346557617, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8923496007919312, + "num_tokens": 648812363.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "grad_norm": 1.6350973844528198, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8939428925514221, + "num_tokens": 648844203.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "grad_norm": 1.527809977531433, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8818684220314026, + "num_tokens": 648885415.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "grad_norm": 1.4628245830535889, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8973878622055054, + "num_tokens": 648925028.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "grad_norm": 1.3979865312576294, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8803001642227173, + "num_tokens": 648968507.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "grad_norm": 1.491302490234375, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8894136548042297, + "num_tokens": 649006787.0, + "step": 17012 + }, + { + "epoch": 2.1642284696603484, + "grad_norm": 1.5886855125427246, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8920366764068604, + "num_tokens": 649042303.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "grad_norm": 1.5005360841751099, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8872092962265015, + "num_tokens": 649081038.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "grad_norm": 1.6843596696853638, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8763917684555054, + "num_tokens": 
649121450.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "grad_norm": 1.5335924625396729, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8833159804344177, + "num_tokens": 649159080.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "grad_norm": 1.779509425163269, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8976938128471375, + "num_tokens": 649190084.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "grad_norm": 1.8450515270233154, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8834896087646484, + "num_tokens": 649219087.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "grad_norm": 1.5535706281661987, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8921482563018799, + "num_tokens": 649254639.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "grad_norm": 1.6444511413574219, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8873158693313599, + "num_tokens": 649290140.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "grad_norm": 1.6081470251083374, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8898700475692749, + "num_tokens": 649323938.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "grad_norm": 1.3973608016967773, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8942847847938538, + "num_tokens": 649367579.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "grad_norm": 1.5771054029464722, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8846701979637146, + "num_tokens": 649404890.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "grad_norm": 1.4857206344604492, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8840343952178955, + "num_tokens": 649447221.0, + "step": 17024 + }, + { + "epoch": 2.1657549930034348, + "grad_norm": 1.6406711339950562, + "learning_rate": 
1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8838014602661133, + "num_tokens": 649487465.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "grad_norm": 1.582028865814209, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8967937231063843, + "num_tokens": 649521489.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "grad_norm": 1.5456063747406006, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.885104238986969, + "num_tokens": 649558363.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "grad_norm": 1.54213547706604, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8868886828422546, + "num_tokens": 649597285.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "grad_norm": 1.5517522096633911, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8914990425109863, + "num_tokens": 649634933.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "grad_norm": 1.6008715629577637, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8902866840362549, + "num_tokens": 649672513.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "grad_norm": 1.7200571298599243, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8822392821311951, + "num_tokens": 649705505.0, + "step": 17031 + }, + { + "epoch": 2.166645464953568, + "grad_norm": 1.5720173120498657, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8905655741691589, + "num_tokens": 649745020.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "grad_norm": 1.520891547203064, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8841880559921265, + "num_tokens": 649789142.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "grad_norm": 1.8348942995071411, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8781447410583496, + "num_tokens": 649820917.0, + "step": 17034 + }, + { + 
"epoch": 2.1670270957893396, + "grad_norm": 1.4445525407791138, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8933268785476685, + "num_tokens": 649862745.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "grad_norm": 1.6443475484848022, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8730543255805969, + "num_tokens": 649899562.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "grad_norm": 1.747257113456726, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8800057172775269, + "num_tokens": 649935023.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "grad_norm": 1.5383837223052979, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8768118023872375, + "num_tokens": 649971986.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "grad_norm": 1.486007809638977, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8674466609954834, + "num_tokens": 650016623.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "grad_norm": 1.4794102907180786, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8979405164718628, + "num_tokens": 650060768.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "grad_norm": 1.5086103677749634, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8898969888687134, + "num_tokens": 650100339.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "grad_norm": 1.699073076248169, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8848930597305298, + "num_tokens": 650133694.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "grad_norm": 1.513818383216858, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8939576745033264, + "num_tokens": 650170405.0, + "step": 17043 + }, + { + "epoch": 2.1681719882966544, + "grad_norm": 1.4648725986480713, + "learning_rate": 1e-06, + "loss": 0.3039, + 
"mean_token_accuracy": 0.8881300687789917, + "num_tokens": 650210366.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "grad_norm": 1.50075364112854, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8863794803619385, + "num_tokens": 650253170.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "grad_norm": 1.5646334886550903, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8859001994132996, + "num_tokens": 650294940.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "grad_norm": 1.7558952569961548, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8768413066864014, + "num_tokens": 650327317.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "grad_norm": 1.6807620525360107, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8869131207466125, + "num_tokens": 650359235.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "grad_norm": 1.5752860307693481, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8775903582572937, + "num_tokens": 650400157.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "grad_norm": 1.6280624866485596, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.892470121383667, + "num_tokens": 650438514.0, + "step": 17050 + }, + { + "epoch": 2.169062460246788, + "grad_norm": 1.6846532821655273, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8842494487762451, + "num_tokens": 650476531.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "grad_norm": 1.7647509574890137, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8615949153900146, + "num_tokens": 650514530.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "grad_norm": 1.5687263011932373, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8964263200759888, + "num_tokens": 650549698.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + 
"grad_norm": 1.5571812391281128, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8988311290740967, + "num_tokens": 650587263.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "grad_norm": 1.4343535900115967, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8891838192939758, + "num_tokens": 650630000.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "grad_norm": 1.5443296432495117, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8795502185821533, + "num_tokens": 650672865.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "grad_norm": 1.6133590936660767, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.888544499874115, + "num_tokens": 650711928.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "grad_norm": 1.559145212173462, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8835246562957764, + "num_tokens": 650750169.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "grad_norm": 1.5185518264770508, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8835146427154541, + "num_tokens": 650792400.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "grad_norm": 1.5184719562530518, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8915784358978271, + "num_tokens": 650831320.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "grad_norm": 1.681240439414978, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8907939791679382, + "num_tokens": 650862864.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "grad_norm": 1.454164743423462, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8757615089416504, + "num_tokens": 650907542.0, + "step": 17062 + }, + { + "epoch": 2.170588983589874, + "grad_norm": 1.5315133333206177, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8698054552078247, + 
"num_tokens": 650951097.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "grad_norm": 1.5090852975845337, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8884166479110718, + "num_tokens": 650987702.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "grad_norm": 1.6141674518585205, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8853169083595276, + "num_tokens": 651024668.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "grad_norm": 1.548067331314087, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8896416425704956, + "num_tokens": 651061687.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "grad_norm": 1.643958330154419, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.894691526889801, + "num_tokens": 651100094.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "grad_norm": 1.4441479444503784, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8867433667182922, + "num_tokens": 651142935.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "grad_norm": 1.46405029296875, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8847030401229858, + "num_tokens": 651184280.0, + "step": 17069 + }, + { + "epoch": 2.1714794555400077, + "grad_norm": 1.5456831455230713, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.8987302780151367, + "num_tokens": 651220808.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "grad_norm": 1.7212458848953247, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8709022998809814, + "num_tokens": 651255666.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "grad_norm": 1.5130687952041626, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.9029577970504761, + "num_tokens": 651290912.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "grad_norm": 1.528286099433899, + 
"learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8838648200035095, + "num_tokens": 651330778.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "grad_norm": 1.8827924728393555, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8859478235244751, + "num_tokens": 651361329.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "grad_norm": 1.6235320568084717, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.8980411291122437, + "num_tokens": 651393546.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "grad_norm": 1.6485403776168823, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8815000057220459, + "num_tokens": 651429976.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "grad_norm": 1.7751708030700684, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8899244070053101, + "num_tokens": 651465460.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "grad_norm": 1.6329995393753052, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8903948068618774, + "num_tokens": 651500091.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "grad_norm": 1.435062289237976, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.891664445400238, + "num_tokens": 651541785.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "grad_norm": 1.5609147548675537, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8916864395141602, + "num_tokens": 651578948.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "grad_norm": 1.6216508150100708, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8879309892654419, + "num_tokens": 651614435.0, + "step": 17081 + }, + { + "epoch": 2.1730059788830935, + "grad_norm": 1.6105852127075195, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8861873149871826, + "num_tokens": 651654107.0, + 
"step": 17082 + }, + { + "epoch": 2.173133189161684, + "grad_norm": 1.602293848991394, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8732364177703857, + "num_tokens": 651693259.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "grad_norm": 1.498132348060608, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8877047300338745, + "num_tokens": 651734535.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "grad_norm": 1.545598030090332, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8712074160575867, + "num_tokens": 651775539.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "grad_norm": 1.6067146062850952, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8787564635276794, + "num_tokens": 651810869.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "grad_norm": 1.6504533290863037, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8831265568733215, + "num_tokens": 651846285.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "grad_norm": 1.3753682374954224, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8986413478851318, + "num_tokens": 651888825.0, + "step": 17088 + }, + { + "epoch": 2.1738964508332272, + "grad_norm": 1.6512608528137207, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8764738440513611, + "num_tokens": 651927874.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "grad_norm": 1.4618810415267944, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8833968639373779, + "num_tokens": 651969294.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "grad_norm": 1.5107274055480957, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8835257291793823, + "num_tokens": 652015123.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "grad_norm": 1.6301498413085938, + "learning_rate": 1e-06, + "loss": 
0.2916, + "mean_token_accuracy": 0.8975821733474731, + "num_tokens": 652050261.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "grad_norm": 1.6583094596862793, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8754114508628845, + "num_tokens": 652088228.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "grad_norm": 1.528838872909546, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.880913257598877, + "num_tokens": 652128273.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "grad_norm": 1.5024994611740112, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8788854479789734, + "num_tokens": 652169022.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "grad_norm": 1.6097116470336914, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8680695295333862, + "num_tokens": 652206735.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "grad_norm": 1.5394923686981201, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8793288469314575, + "num_tokens": 652247060.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "grad_norm": 1.6322996616363525, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8726322650909424, + "num_tokens": 652287987.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "grad_norm": 1.5528922080993652, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8881677985191345, + "num_tokens": 652325947.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "grad_norm": 1.4794397354125977, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8856441378593445, + "num_tokens": 652366705.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "grad_norm": 1.674545168876648, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.881661593914032, + "num_tokens": 652401139.0, + "step": 17101 + }, + { + "epoch": 
2.175550184454904, + "grad_norm": 1.6156238317489624, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8833518624305725, + "num_tokens": 652438273.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "grad_norm": 1.5757567882537842, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8842563629150391, + "num_tokens": 652480146.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "grad_norm": 1.5048619508743286, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8850929141044617, + "num_tokens": 652518871.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "grad_norm": 1.4437479972839355, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8949972987174988, + "num_tokens": 652559602.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "grad_norm": 1.5547033548355103, + "learning_rate": 1e-06, + "loss": 0.2608, + "mean_token_accuracy": 0.9017895460128784, + "num_tokens": 652595115.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "grad_norm": 1.6434412002563477, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8710529804229736, + "num_tokens": 652633926.0, + "step": 17107 + }, + { + "epoch": 2.176313446126447, + "grad_norm": 1.8617359399795532, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8669477105140686, + "num_tokens": 652667092.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "grad_norm": 1.4648075103759766, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8701965808868408, + "num_tokens": 652712002.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "grad_norm": 1.5457087755203247, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8885799646377563, + "num_tokens": 652753096.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "grad_norm": 1.6344267129898071, + "learning_rate": 1e-06, + "loss": 0.2731, + 
"mean_token_accuracy": 0.90081387758255, + "num_tokens": 652786454.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "grad_norm": 1.5906959772109985, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8936654925346375, + "num_tokens": 652823482.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "grad_norm": 1.5468051433563232, + "learning_rate": 1e-06, + "loss": 0.2643, + "mean_token_accuracy": 0.9026862382888794, + "num_tokens": 652856571.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "grad_norm": 1.7875080108642578, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8734996318817139, + "num_tokens": 652890819.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "grad_norm": 1.5393370389938354, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8895468711853027, + "num_tokens": 652929430.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "grad_norm": 1.5205174684524536, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8810091614723206, + "num_tokens": 652970747.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "grad_norm": 1.5874601602554321, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8938735723495483, + "num_tokens": 653004711.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "grad_norm": 1.585368275642395, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.897869884967804, + "num_tokens": 653041748.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "grad_norm": 1.6100465059280396, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8906890153884888, + "num_tokens": 653080523.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "grad_norm": 1.7357113361358643, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.887721836566925, + "num_tokens": 653113378.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + 
"grad_norm": 1.5904866456985474, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8920183181762695, + "num_tokens": 653148335.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "grad_norm": 1.3766074180603027, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8889114856719971, + "num_tokens": 653193646.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "grad_norm": 1.6268491744995117, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8731322288513184, + "num_tokens": 653232572.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "grad_norm": 1.58159601688385, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8798856735229492, + "num_tokens": 653271779.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "grad_norm": 1.5244346857070923, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8841467499732971, + "num_tokens": 653312939.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "grad_norm": 1.4758025407791138, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8917081356048584, + "num_tokens": 653352567.0, + "step": 17126 + }, + { + "epoch": 2.178730441419667, + "grad_norm": 1.6109808683395386, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8840349912643433, + "num_tokens": 653390823.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "grad_norm": 1.4942357540130615, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8877853155136108, + "num_tokens": 653430220.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "grad_norm": 1.4683181047439575, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8825206756591797, + "num_tokens": 653475067.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "grad_norm": 1.614538311958313, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8807810544967651, + 
"num_tokens": 653512563.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "grad_norm": 1.6187161207199097, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8940215706825256, + "num_tokens": 653546539.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "grad_norm": 1.5957636833190918, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8928611278533936, + "num_tokens": 653586459.0, + "step": 17132 + }, + { + "epoch": 2.1794937030912096, + "grad_norm": 1.5334362983703613, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8888237476348877, + "num_tokens": 653623879.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "grad_norm": 1.6651890277862549, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8815648555755615, + "num_tokens": 653661828.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "grad_norm": 1.381042718887329, + "learning_rate": 1e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9015050530433655, + "num_tokens": 653702019.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "grad_norm": 1.4211530685424805, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8919612169265747, + "num_tokens": 653744073.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "grad_norm": 1.6795763969421387, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8817893862724304, + "num_tokens": 653780964.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "grad_norm": 1.4564441442489624, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8937874436378479, + "num_tokens": 653819040.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "grad_norm": 1.5153236389160156, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8896670937538147, + "num_tokens": 653857630.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "grad_norm": 1.513638973236084, + 
"learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8877860307693481, + "num_tokens": 653896236.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "grad_norm": 1.5376484394073486, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.8979418277740479, + "num_tokens": 653933116.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "grad_norm": 1.6468276977539062, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8769205808639526, + "num_tokens": 653971110.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "grad_norm": 1.5173277854919434, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8883224129676819, + "num_tokens": 654010551.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "grad_norm": 1.4333138465881348, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8903055191040039, + "num_tokens": 654049889.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "grad_norm": 1.5650827884674072, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8710882663726807, + "num_tokens": 654087782.0, + "step": 17145 + }, + { + "epoch": 2.1811474367128865, + "grad_norm": 1.4829959869384766, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8815116882324219, + "num_tokens": 654128520.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "grad_norm": 1.5971050262451172, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8844795227050781, + "num_tokens": 654166739.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "grad_norm": 1.3012032508850098, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.8997035026550293, + "num_tokens": 654212682.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "grad_norm": 1.7093371152877808, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8772540092468262, + "num_tokens": 654248648.0, + "step": 
17149 + }, + { + "epoch": 2.1816562778272486, + "grad_norm": 1.562574028968811, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.89109206199646, + "num_tokens": 654284421.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "grad_norm": 1.5350463390350342, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8808143138885498, + "num_tokens": 654327969.0, + "step": 17151 + }, + { + "epoch": 2.1819106983844296, + "grad_norm": 1.5260977745056152, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8968666791915894, + "num_tokens": 654367141.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "grad_norm": 1.6021697521209717, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8805707693099976, + "num_tokens": 654405412.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "grad_norm": 1.6440935134887695, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8775221109390259, + "num_tokens": 654443477.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "grad_norm": 1.7490416765213013, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8628386855125427, + "num_tokens": 654481399.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "grad_norm": 1.820860743522644, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.888763427734375, + "num_tokens": 654513104.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "grad_norm": 1.7519726753234863, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8806277513504028, + "num_tokens": 654547295.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "grad_norm": 1.5356570482254028, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8995307683944702, + "num_tokens": 654582783.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "grad_norm": 1.5087578296661377, + "learning_rate": 1e-06, + "loss": 0.3224, + 
"mean_token_accuracy": 0.8842946290969849, + "num_tokens": 654620555.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "grad_norm": 1.5042994022369385, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9020476341247559, + "num_tokens": 654657291.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "grad_norm": 1.5038566589355469, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8928861021995544, + "num_tokens": 654695173.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "grad_norm": 1.523444414138794, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.880207359790802, + "num_tokens": 654736372.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "grad_norm": 1.6736228466033936, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8794810771942139, + "num_tokens": 654772012.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "grad_norm": 1.6452350616455078, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8734461665153503, + "num_tokens": 654812130.0, + "step": 17164 + }, + { + "epoch": 2.183564432006106, + "grad_norm": 1.5111827850341797, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8911605477333069, + "num_tokens": 654849282.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "grad_norm": 1.6763964891433716, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.87920081615448, + "num_tokens": 654884788.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "grad_norm": 1.5073734521865845, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8812730312347412, + "num_tokens": 654927278.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "grad_norm": 1.7007681131362915, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8809123039245605, + "num_tokens": 654961546.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, 
+ "grad_norm": 1.6284544467926025, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.892162024974823, + "num_tokens": 654997088.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "grad_norm": 1.7581497430801392, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8907686471939087, + "num_tokens": 655026759.0, + "step": 17170 + }, + { + "epoch": 2.1843276936776492, + "grad_norm": 1.6418211460113525, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8723255395889282, + "num_tokens": 655063615.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "grad_norm": 1.831395149230957, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8808904886245728, + "num_tokens": 655097324.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "grad_norm": 1.551910638809204, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8825955986976624, + "num_tokens": 655135455.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "grad_norm": 1.4419714212417603, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8881401419639587, + "num_tokens": 655178170.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "grad_norm": 1.5690853595733643, + "learning_rate": 1e-06, + "loss": 0.2768, + "mean_token_accuracy": 0.9008592963218689, + "num_tokens": 655212920.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "grad_norm": 1.7077127695083618, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8764914870262146, + "num_tokens": 655252652.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "grad_norm": 1.4780150651931763, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8823560476303101, + "num_tokens": 655292699.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "grad_norm": 1.662410020828247, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8855502605438232, + 
"num_tokens": 655330599.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "grad_norm": 1.645459771156311, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8815135955810547, + "num_tokens": 655367770.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "grad_norm": 1.5535242557525635, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8820631504058838, + "num_tokens": 655410152.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "grad_norm": 1.6203851699829102, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8841788172721863, + "num_tokens": 655445825.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "grad_norm": 1.6211830377578735, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8788983821868896, + "num_tokens": 655488081.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "grad_norm": 1.4816703796386719, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8941298723220825, + "num_tokens": 655527678.0, + "step": 17183 + }, + { + "epoch": 2.1859814272993257, + "grad_norm": 1.4586284160614014, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8794316053390503, + "num_tokens": 655572656.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "grad_norm": 1.6183111667633057, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8883901238441467, + "num_tokens": 655608268.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "grad_norm": 1.5182253122329712, + "learning_rate": 1e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.8996727466583252, + "num_tokens": 655644525.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "grad_norm": 1.4860846996307373, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8798791170120239, + "num_tokens": 655684871.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "grad_norm": 1.5919874906539917, + 
"learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8846497535705566, + "num_tokens": 655720342.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "grad_norm": 1.413214921951294, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8947102427482605, + "num_tokens": 655762966.0, + "step": 17189 + }, + { + "epoch": 2.186744688970869, + "grad_norm": 1.4718291759490967, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8785367012023926, + "num_tokens": 655805543.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "grad_norm": 1.4361544847488403, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.899670422077179, + "num_tokens": 655844694.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "grad_norm": 1.5778303146362305, + "learning_rate": 1e-06, + "loss": 0.2608, + "mean_token_accuracy": 0.9046359062194824, + "num_tokens": 655878490.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "grad_norm": 1.5102566480636597, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8975315690040588, + "num_tokens": 655917069.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "grad_norm": 1.5026084184646606, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8916640281677246, + "num_tokens": 655958066.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "grad_norm": 1.6125789880752563, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8992406129837036, + "num_tokens": 655991916.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "grad_norm": 1.5149935483932495, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8870773315429688, + "num_tokens": 656030208.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "grad_norm": 1.5265777111053467, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8938275575637817, + "num_tokens": 656070443.0, + "step": 
17197 + }, + { + "epoch": 2.187762371199593, + "grad_norm": 1.6627166271209717, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8834898471832275, + "num_tokens": 656107765.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "grad_norm": 1.6073318719863892, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8744964599609375, + "num_tokens": 656147830.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "grad_norm": 1.4589998722076416, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8914514780044556, + "num_tokens": 656189028.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "grad_norm": 1.5198169946670532, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8873996138572693, + "num_tokens": 656227934.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "grad_norm": 1.6227047443389893, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8807841539382935, + "num_tokens": 656262984.0, + "step": 17202 + }, + { + "epoch": 2.1883984225925452, + "grad_norm": 1.8118696212768555, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8712871670722961, + "num_tokens": 656296944.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "grad_norm": 1.441658854484558, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8820819854736328, + "num_tokens": 656340115.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "grad_norm": 1.5690947771072388, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8861525058746338, + "num_tokens": 656375620.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "grad_norm": 1.6808494329452515, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8841090202331543, + "num_tokens": 656409290.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "grad_norm": 1.557403326034546, + "learning_rate": 1e-06, + "loss": 0.3124, + 
"mean_token_accuracy": 0.8874274492263794, + "num_tokens": 656444515.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "grad_norm": 1.514695405960083, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8855708837509155, + "num_tokens": 656487638.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "grad_norm": 1.6377503871917725, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8880807757377625, + "num_tokens": 656523510.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "grad_norm": 1.4989007711410522, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8919272422790527, + "num_tokens": 656564143.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "grad_norm": 1.6180286407470703, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8877617120742798, + "num_tokens": 656598285.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "grad_norm": 1.523314118385315, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.883892297744751, + "num_tokens": 656641531.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "grad_norm": 1.595162034034729, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8955327868461609, + "num_tokens": 656681782.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "grad_norm": 1.6840018033981323, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8922165632247925, + "num_tokens": 656717935.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "grad_norm": 1.5541056394577026, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8795611262321472, + "num_tokens": 656757538.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "grad_norm": 1.6081217527389526, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8720093369483948, + "num_tokens": 656795427.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, 
+ "grad_norm": 1.5193849802017212, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.881790041923523, + "num_tokens": 656838754.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "grad_norm": 1.7165601253509521, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8729872703552246, + "num_tokens": 656875014.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "grad_norm": 1.5382400751113892, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8850242495536804, + "num_tokens": 656914064.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "grad_norm": 1.6586486101150513, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8782208561897278, + "num_tokens": 656947640.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "grad_norm": 1.6776466369628906, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8769400119781494, + "num_tokens": 656985203.0, + "step": 17221 + }, + { + "epoch": 2.1908154178857653, + "grad_norm": 1.5338963270187378, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8840774297714233, + "num_tokens": 657023421.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "grad_norm": 1.3962328433990479, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8889623880386353, + "num_tokens": 657067094.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "grad_norm": 1.4131455421447754, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8944143056869507, + "num_tokens": 657110404.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "grad_norm": 1.5968471765518188, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8724422454833984, + "num_tokens": 657147949.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "grad_norm": 1.693390965461731, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8877958059310913, 
+ "num_tokens": 657180063.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "grad_norm": 1.5348678827285767, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8945790529251099, + "num_tokens": 657218577.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "grad_norm": 1.5714731216430664, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8893566131591797, + "num_tokens": 657258860.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "grad_norm": 1.6792094707489014, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8724166750907898, + "num_tokens": 657294902.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "grad_norm": 1.6321911811828613, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8824392557144165, + "num_tokens": 657331931.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "grad_norm": 1.6150593757629395, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8812318444252014, + "num_tokens": 657367692.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "grad_norm": 1.6383788585662842, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8656398057937622, + "num_tokens": 657409785.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "grad_norm": 1.6154253482818604, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8762731552124023, + "num_tokens": 657444435.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "grad_norm": 1.5556443929672241, + "learning_rate": 1e-06, + "loss": 0.2691, + "mean_token_accuracy": 0.9024841785430908, + "num_tokens": 657480054.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "grad_norm": 1.7300429344177246, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8860255479812622, + "num_tokens": 657512814.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "grad_norm": 1.5359830856323242, + 
"learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8899717330932617, + "num_tokens": 657550734.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "grad_norm": 1.5063855648040771, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8924834728240967, + "num_tokens": 657587855.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "grad_norm": 1.4231144189834595, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8950604796409607, + "num_tokens": 657631080.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "grad_norm": 1.622185230255127, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8784075379371643, + "num_tokens": 657670092.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "grad_norm": 1.5065606832504272, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8891521096229553, + "num_tokens": 657708224.0, + "step": 17240 + }, + { + "epoch": 2.193232413178985, + "grad_norm": 1.456008791923523, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.90019291639328, + "num_tokens": 657748626.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "grad_norm": 1.7008990049362183, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8789912462234497, + "num_tokens": 657782603.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "grad_norm": 1.5115714073181152, + "learning_rate": 1e-06, + "loss": 0.27, + "mean_token_accuracy": 0.903814435005188, + "num_tokens": 657815950.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "grad_norm": 1.6992629766464233, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8714531064033508, + "num_tokens": 657853353.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "grad_norm": 1.5396167039871216, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8900542855262756, + "num_tokens": 657891494.0, + "step": 
17245 + }, + { + "epoch": 2.1938684645719375, + "grad_norm": 1.752660870552063, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8774658441543579, + "num_tokens": 657924851.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "grad_norm": 1.521363377571106, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8743748068809509, + "num_tokens": 657968896.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "grad_norm": 1.526540994644165, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8965442180633545, + "num_tokens": 658006297.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "grad_norm": 1.5976659059524536, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8897141218185425, + "num_tokens": 658041978.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "grad_norm": 1.5910745859146118, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8812525272369385, + "num_tokens": 658079985.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "grad_norm": 1.559510588645935, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8835592865943909, + "num_tokens": 658119410.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "grad_norm": 1.664853811264038, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8620762825012207, + "num_tokens": 658159692.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "grad_norm": 1.7933095693588257, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8820524215698242, + "num_tokens": 658193742.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "grad_norm": 1.559181571006775, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8978059887886047, + "num_tokens": 658231407.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "grad_norm": 1.612741470336914, + "learning_rate": 1e-06, + "loss": 0.337, + 
"mean_token_accuracy": 0.877488374710083, + "num_tokens": 658271364.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "grad_norm": 1.5800666809082031, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8955128192901611, + "num_tokens": 658304479.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "grad_norm": 1.6306790113449097, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8918113708496094, + "num_tokens": 658338519.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "grad_norm": 1.5757882595062256, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.877779483795166, + "num_tokens": 658375325.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "grad_norm": 1.5583686828613281, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8890905380249023, + "num_tokens": 658416273.0, + "step": 17259 + }, + { + "epoch": 2.1956494084722045, + "grad_norm": 1.5514739751815796, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8847795128822327, + "num_tokens": 658458320.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "grad_norm": 1.5115001201629639, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.893500804901123, + "num_tokens": 658496232.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "grad_norm": 1.6311519145965576, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8755726218223572, + "num_tokens": 658532209.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "grad_norm": 1.6660528182983398, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8761816024780273, + "num_tokens": 658568782.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "grad_norm": 1.5197690725326538, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8824681639671326, + "num_tokens": 658613385.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, 
+ "grad_norm": 1.7966899871826172, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8712610006332397, + "num_tokens": 658645647.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "grad_norm": 1.4854077100753784, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8909270763397217, + "num_tokens": 658687328.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "grad_norm": 1.4906965494155884, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8836666345596313, + "num_tokens": 658727357.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "grad_norm": 1.6553133726119995, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8842638731002808, + "num_tokens": 658762669.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "grad_norm": 1.4277952909469604, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8868159055709839, + "num_tokens": 658803436.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "grad_norm": 1.7549357414245605, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8819436430931091, + "num_tokens": 658834720.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "grad_norm": 1.5138756036758423, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8880652785301208, + "num_tokens": 658877242.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "grad_norm": 1.5235193967819214, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8858768939971924, + "num_tokens": 658919014.0, + "step": 17272 + }, + { + "epoch": 2.1973031420938813, + "grad_norm": 1.6637279987335205, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8800430297851562, + "num_tokens": 658953938.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "grad_norm": 1.5150898694992065, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8870195150375366, + 
"num_tokens": 658991543.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "grad_norm": 1.5204039812088013, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.890315055847168, + "num_tokens": 659032390.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "grad_norm": 1.6240224838256836, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8895173668861389, + "num_tokens": 659067375.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "grad_norm": 1.5713475942611694, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8862110376358032, + "num_tokens": 659102419.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "grad_norm": 1.6018335819244385, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8845383524894714, + "num_tokens": 659139500.0, + "step": 17278 + }, + { + "epoch": 2.198066403765424, + "grad_norm": 1.8208203315734863, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8833426237106323, + "num_tokens": 659170548.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "grad_norm": 1.6955147981643677, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8765662908554077, + "num_tokens": 659208923.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "grad_norm": 1.6815822124481201, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8775075674057007, + "num_tokens": 659243412.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "grad_norm": 1.6391780376434326, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8905009627342224, + "num_tokens": 659280171.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "grad_norm": 1.4965169429779053, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8828681707382202, + "num_tokens": 659323022.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "grad_norm": 1.6521141529083252, + 
"learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8719016313552856, + "num_tokens": 659359630.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "grad_norm": 1.4863569736480713, + "learning_rate": 1e-06, + "loss": 0.2601, + "mean_token_accuracy": 0.9047453999519348, + "num_tokens": 659399302.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "grad_norm": 1.6900330781936646, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8939559459686279, + "num_tokens": 659430448.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "grad_norm": 1.4496958255767822, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8900277614593506, + "num_tokens": 659474504.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "grad_norm": 1.5720502138137817, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8927585482597351, + "num_tokens": 659510225.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "grad_norm": 1.5499600172042847, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.872079074382782, + "num_tokens": 659551872.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "grad_norm": 1.6276707649230957, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8872387409210205, + "num_tokens": 659585540.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "grad_norm": 1.528387427330017, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8774087429046631, + "num_tokens": 659627344.0, + "step": 17291 + }, + { + "epoch": 2.199720137387101, + "grad_norm": 1.5710113048553467, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8731480836868286, + "num_tokens": 659664613.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "grad_norm": 1.3998862504959106, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.899863600730896, + "num_tokens": 659705711.0, + "step": 
17293 + }, + { + "epoch": 2.199974557944282, + "grad_norm": 1.4866958856582642, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8978258371353149, + "num_tokens": 659741854.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "grad_norm": 1.6138440370559692, + "learning_rate": 1e-06, + "loss": 0.2494, + "mean_token_accuracy": 0.9078027606010437, + "num_tokens": 659774922.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "grad_norm": 1.5024124383926392, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8949059844017029, + "num_tokens": 659814051.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "grad_norm": 1.7091230154037476, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8816729784011841, + "num_tokens": 659851520.0, + "step": 17297 + }, + { + "epoch": 2.200483399058644, + "grad_norm": 1.6577296257019043, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8918132781982422, + "num_tokens": 659886732.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "grad_norm": 1.520433783531189, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8762261271476746, + "num_tokens": 659926774.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "grad_norm": 1.4585566520690918, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8923227787017822, + "num_tokens": 659969997.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "grad_norm": 1.6139684915542603, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8775270581245422, + "num_tokens": 660008138.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "grad_norm": 1.3830363750457764, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8959705829620361, + "num_tokens": 660052034.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "grad_norm": 1.681139349937439, + "learning_rate": 1e-06, + "loss": 0.3247, + 
"mean_token_accuracy": 0.8828542828559875, + "num_tokens": 660084052.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "grad_norm": 1.5266300439834595, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.877665638923645, + "num_tokens": 660124689.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "grad_norm": 1.4917793273925781, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.88298100233078, + "num_tokens": 660168395.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "grad_norm": 1.5302969217300415, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8792088031768799, + "num_tokens": 660211552.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "grad_norm": 1.4567620754241943, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.88742995262146, + "num_tokens": 660252034.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "grad_norm": 1.5461993217468262, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8928142786026001, + "num_tokens": 660291431.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "grad_norm": 1.468887448310852, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8899933695793152, + "num_tokens": 660333243.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "grad_norm": 1.5148842334747314, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.8978321552276611, + "num_tokens": 660369298.0, + "step": 17310 + }, + { + "epoch": 2.2021371326803205, + "grad_norm": 1.579486608505249, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8988295793533325, + "num_tokens": 660405642.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "grad_norm": 1.716410517692566, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8917239308357239, + "num_tokens": 660445224.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + 
"grad_norm": 1.505066990852356, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8870455026626587, + "num_tokens": 660487258.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "grad_norm": 1.5000497102737427, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8907086849212646, + "num_tokens": 660526699.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "grad_norm": 1.5993191003799438, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.879952609539032, + "num_tokens": 660563089.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "grad_norm": 1.7459460496902466, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8883756995201111, + "num_tokens": 660596012.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + "grad_norm": 1.541221022605896, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8926385641098022, + "num_tokens": 660635985.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "grad_norm": 1.5455615520477295, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8870651721954346, + "num_tokens": 660673802.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "grad_norm": 1.6068203449249268, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8875970840454102, + "num_tokens": 660711790.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "grad_norm": 1.5216889381408691, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8918614387512207, + "num_tokens": 660750614.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "grad_norm": 1.464074730873108, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.899193525314331, + "num_tokens": 660788615.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "grad_norm": 1.6289076805114746, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8967515230178833, + 
"num_tokens": 660823441.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "grad_norm": 1.605859637260437, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8927426338195801, + "num_tokens": 660863157.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "grad_norm": 1.5423139333724976, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8848806619644165, + "num_tokens": 660903338.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "grad_norm": 1.5512244701385498, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.88791424036026, + "num_tokens": 660940244.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "grad_norm": 1.494657039642334, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8872592449188232, + "num_tokens": 660980654.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "grad_norm": 1.6755053997039795, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8833569884300232, + "num_tokens": 661017580.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "grad_norm": 1.5356742143630981, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.888266921043396, + "num_tokens": 661055873.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "grad_norm": 1.8995555639266968, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.884268045425415, + "num_tokens": 661095388.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, + "grad_norm": 1.579447627067566, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8933336138725281, + "num_tokens": 661136442.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "grad_norm": 1.5835967063903809, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8863720893859863, + "num_tokens": 661174484.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "grad_norm": 1.6854925155639648, + 
"learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8842357993125916, + "num_tokens": 661207425.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "grad_norm": 1.7695170640945435, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8631608486175537, + "num_tokens": 661243022.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "grad_norm": 1.637722134590149, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8867407441139221, + "num_tokens": 661277770.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "grad_norm": 1.657769799232483, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8892682790756226, + "num_tokens": 661312512.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "grad_norm": 1.5528444051742554, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8845747709274292, + "num_tokens": 661350432.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "grad_norm": 1.7327992916107178, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.861970067024231, + "num_tokens": 661388529.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "grad_norm": 1.6026700735092163, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8971387147903442, + "num_tokens": 661422952.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "grad_norm": 1.5902618169784546, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8962932229042053, + "num_tokens": 661457424.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "grad_norm": 1.5098469257354736, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8904283046722412, + "num_tokens": 661498566.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "grad_norm": 1.5526559352874756, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8932307362556458, + "num_tokens": 661536564.0, + 
"step": 17341 + }, + { + "epoch": 2.2060806513166265, + "grad_norm": 1.7301719188690186, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8770855069160461, + "num_tokens": 661572147.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "grad_norm": 1.5343555212020874, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8906151056289673, + "num_tokens": 661610747.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "grad_norm": 1.5924391746520996, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8910950422286987, + "num_tokens": 661647262.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "grad_norm": 1.6968274116516113, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8781086206436157, + "num_tokens": 661679119.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "grad_norm": 1.6229947805404663, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8976894617080688, + "num_tokens": 661712615.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "grad_norm": 1.716752529144287, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8671654462814331, + "num_tokens": 661749467.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "grad_norm": 1.677323818206787, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.884526252746582, + "num_tokens": 661782762.0, + "step": 17348 + }, + { + "epoch": 2.20697112326676, + "grad_norm": 1.4404815435409546, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8876129984855652, + "num_tokens": 661823270.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "grad_norm": 1.4263947010040283, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8968408107757568, + "num_tokens": 661862877.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "grad_norm": 1.626187801361084, + "learning_rate": 1e-06, + "loss": 
0.3073, + "mean_token_accuracy": 0.8862718343734741, + "num_tokens": 661903964.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "grad_norm": 1.697556972503662, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8732999563217163, + "num_tokens": 661938749.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "grad_norm": 1.7402509450912476, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8840216994285583, + "num_tokens": 661968897.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "grad_norm": 1.5611801147460938, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8827738761901855, + "num_tokens": 662008514.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "grad_norm": 1.4515841007232666, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8854422569274902, + "num_tokens": 662047977.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "grad_norm": 1.5992449522018433, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8834787607192993, + "num_tokens": 662085883.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "grad_norm": 1.4415465593338013, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8977693319320679, + "num_tokens": 662126943.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "grad_norm": 1.426994800567627, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8944905996322632, + "num_tokens": 662171493.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "grad_norm": 1.640607237815857, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8791494965553284, + "num_tokens": 662208454.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "grad_norm": 1.4811830520629883, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8963861465454102, + "num_tokens": 662245430.0, + "step": 17360 + }, + { + "epoch": 
2.208497646609846, + "grad_norm": 1.6386011838912964, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8786901235580444, + "num_tokens": 662285557.0, + "step": 17361 + }, + { + "epoch": 2.2086248568884366, + "grad_norm": 1.55862557888031, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8908733129501343, + "num_tokens": 662321702.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "grad_norm": 1.6890583038330078, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.883730947971344, + "num_tokens": 662357406.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "grad_norm": 1.6532362699508667, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8912352919578552, + "num_tokens": 662391362.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "grad_norm": 1.6170753240585327, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8690375089645386, + "num_tokens": 662430995.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "grad_norm": 1.4158556461334229, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.888430655002594, + "num_tokens": 662472508.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "grad_norm": 1.5156444311141968, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8795910477638245, + "num_tokens": 662515498.0, + "step": 17367 + }, + { + "epoch": 2.2093881185599797, + "grad_norm": 1.6449384689331055, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8864281177520752, + "num_tokens": 662555359.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "grad_norm": 1.4978065490722656, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.9004976749420166, + "num_tokens": 662594203.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "grad_norm": 1.6068543195724487, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 
0.8838682174682617, + "num_tokens": 662631823.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "grad_norm": 1.6768256425857544, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8898845314979553, + "num_tokens": 662669492.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "grad_norm": 1.6411843299865723, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8777207136154175, + "num_tokens": 662706532.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "grad_norm": 1.6862411499023438, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8755330443382263, + "num_tokens": 662742407.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "grad_norm": 1.5440694093704224, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8882445096969604, + "num_tokens": 662778206.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "grad_norm": 1.6642322540283203, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.881651759147644, + "num_tokens": 662813390.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "grad_norm": 1.589045524597168, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.895691990852356, + "num_tokens": 662847364.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "grad_norm": 1.46049165725708, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.887705385684967, + "num_tokens": 662888814.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "grad_norm": 1.6244398355484009, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.881321907043457, + "num_tokens": 662922335.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "grad_norm": 1.454257607460022, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8906649351119995, + "num_tokens": 662966149.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "grad_norm": 
1.6964786052703857, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8921135663986206, + "num_tokens": 663000375.0, + "step": 17380 + }, + { + "epoch": 2.211041852181656, + "grad_norm": 1.6947842836380005, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8872842788696289, + "num_tokens": 663034146.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "grad_norm": 1.6357365846633911, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.866101861000061, + "num_tokens": 663075281.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "grad_norm": 1.581403136253357, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8964521884918213, + "num_tokens": 663111139.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "grad_norm": 1.4761093854904175, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8922388553619385, + "num_tokens": 663149872.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "grad_norm": 1.6552016735076904, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8905487060546875, + "num_tokens": 663186221.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "grad_norm": 1.6655142307281494, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8915356397628784, + "num_tokens": 663221407.0, + "step": 17386 + }, + { + "epoch": 2.2118051138531993, + "grad_norm": 1.6312870979309082, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.884677529335022, + "num_tokens": 663260355.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "grad_norm": 1.4689915180206299, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8924044370651245, + "num_tokens": 663299766.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "grad_norm": 1.533806324005127, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8724370002746582, + "num_tokens": 
663342605.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "grad_norm": 1.568004846572876, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8854863047599792, + "num_tokens": 663381714.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "grad_norm": 1.4525487422943115, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.902102530002594, + "num_tokens": 663420408.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "grad_norm": 1.5884523391723633, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8888698816299438, + "num_tokens": 663457023.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "grad_norm": 1.5121737718582153, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8833613991737366, + "num_tokens": 663498397.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "grad_norm": 1.6314228773117065, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8884948492050171, + "num_tokens": 663530445.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "grad_norm": 1.5716949701309204, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8956985473632812, + "num_tokens": 663566195.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "grad_norm": 1.6254260540008545, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8931269645690918, + "num_tokens": 663601930.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "grad_norm": 1.5466350317001343, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8709090948104858, + "num_tokens": 663642554.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "grad_norm": 1.5164121389389038, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8877687454223633, + "num_tokens": 663680516.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "grad_norm": 1.6100441217422485, + "learning_rate": 
1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8953847885131836, + "num_tokens": 663713916.0, + "step": 17399 + }, + { + "epoch": 2.213458847474876, + "grad_norm": 1.4631415605545044, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8906097412109375, + "num_tokens": 663754273.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "grad_norm": 1.4411171674728394, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8796924948692322, + "num_tokens": 663799139.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "grad_norm": 1.6098593473434448, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8938480615615845, + "num_tokens": 663833244.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "grad_norm": 1.6753441095352173, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8769073486328125, + "num_tokens": 663871322.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "grad_norm": 1.519502878189087, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8900032639503479, + "num_tokens": 663912595.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "grad_norm": 1.6465998888015747, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8874611854553223, + "num_tokens": 663946436.0, + "step": 17405 + }, + { + "epoch": 2.214222109146419, + "grad_norm": 1.584076166152954, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8849560022354126, + "num_tokens": 663983058.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "grad_norm": 1.587238073348999, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8816986083984375, + "num_tokens": 664022940.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "grad_norm": 1.6765458583831787, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8608719706535339, + "num_tokens": 664059104.0, + "step": 17408 + }, + { + 
"epoch": 2.2146037399821905, + "grad_norm": 1.469447374343872, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8911029100418091, + "num_tokens": 664099051.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "grad_norm": 1.790117859840393, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8632653951644897, + "num_tokens": 664136192.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "grad_norm": 1.5250049829483032, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.901660144329071, + "num_tokens": 664173184.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "grad_norm": 1.7302882671356201, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.870324969291687, + "num_tokens": 664207129.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "grad_norm": 1.5947861671447754, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8914978504180908, + "num_tokens": 664241478.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "grad_norm": 1.6591078042984009, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8793995380401611, + "num_tokens": 664279790.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "grad_norm": 1.6020857095718384, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8838527202606201, + "num_tokens": 664318606.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "grad_norm": 1.5184842348098755, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8883662819862366, + "num_tokens": 664355518.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "grad_norm": 1.539346694946289, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8874680399894714, + "num_tokens": 664396745.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "grad_norm": 1.4923707246780396, + "learning_rate": 1e-06, + "loss": 0.3335, + 
"mean_token_accuracy": 0.8846903443336487, + "num_tokens": 664440685.0, + "step": 17418 + }, + { + "epoch": 2.215875842768096, + "grad_norm": 1.621141791343689, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8858338594436646, + "num_tokens": 664477015.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "grad_norm": 1.3993178606033325, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8938743472099304, + "num_tokens": 664522128.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "grad_norm": 1.5697942972183228, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8911974430084229, + "num_tokens": 664562535.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "grad_norm": 1.7506952285766602, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8729774951934814, + "num_tokens": 664593945.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "grad_norm": 1.7530814409255981, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.888930082321167, + "num_tokens": 664625191.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "grad_norm": 1.5678472518920898, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8870552182197571, + "num_tokens": 664662637.0, + "step": 17424 + }, + { + "epoch": 2.2166391044396385, + "grad_norm": 1.629714012145996, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8869713544845581, + "num_tokens": 664697212.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "grad_norm": 1.9008837938308716, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8671471476554871, + "num_tokens": 664735021.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "grad_norm": 1.607050895690918, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8842756152153015, + "num_tokens": 664769212.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + 
"grad_norm": 1.6217098236083984, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8876221179962158, + "num_tokens": 664809211.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "grad_norm": 1.6943038702011108, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8815478682518005, + "num_tokens": 664841788.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "grad_norm": 1.6051242351531982, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8888289332389832, + "num_tokens": 664876334.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "grad_norm": 1.584524154663086, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.889129638671875, + "num_tokens": 664913720.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "grad_norm": 1.599474310874939, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8770263195037842, + "num_tokens": 664951595.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "grad_norm": 1.4568161964416504, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8940004110336304, + "num_tokens": 664991823.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "grad_norm": 1.478469729423523, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8870199918746948, + "num_tokens": 665032287.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "grad_norm": 1.361762523651123, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8867877125740051, + "num_tokens": 665078174.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "grad_norm": 1.5673487186431885, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8905067443847656, + "num_tokens": 665115027.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "grad_norm": 1.5936709642410278, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8961343765258789, + 
"num_tokens": 665148218.0, + "step": 17437 + }, + { + "epoch": 2.2182928380613154, + "grad_norm": 1.5272800922393799, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.876000165939331, + "num_tokens": 665189408.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "grad_norm": 1.5374476909637451, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8918304443359375, + "num_tokens": 665228731.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "grad_norm": 1.5797213315963745, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8834954500198364, + "num_tokens": 665266874.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "grad_norm": 1.5796672105789185, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8788718581199646, + "num_tokens": 665305188.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "grad_norm": 1.414747714996338, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8817718029022217, + "num_tokens": 665348755.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "grad_norm": 1.7093173265457153, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8942255973815918, + "num_tokens": 665380061.0, + "step": 17443 + }, + { + "epoch": 2.2190560997328586, + "grad_norm": 1.5464507341384888, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8846951127052307, + "num_tokens": 665418346.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "grad_norm": 1.5541596412658691, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.877747654914856, + "num_tokens": 665460350.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "grad_norm": 1.4679455757141113, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8956400156021118, + "num_tokens": 665498270.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "grad_norm": 1.5720456838607788, + 
"learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8730885982513428, + "num_tokens": 665535934.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "grad_norm": 1.4143061637878418, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8969453573226929, + "num_tokens": 665578383.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "grad_norm": 1.5531622171401978, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8762940764427185, + "num_tokens": 665622693.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "grad_norm": 1.6489613056182861, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8886224031448364, + "num_tokens": 665654663.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "grad_norm": 1.5878915786743164, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8826568126678467, + "num_tokens": 665691950.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "grad_norm": 1.5730394124984741, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8931034803390503, + "num_tokens": 665727062.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "grad_norm": 1.494868516921997, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8951386213302612, + "num_tokens": 665766531.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "grad_norm": 1.628676176071167, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8862349987030029, + "num_tokens": 665801057.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "grad_norm": 1.5568362474441528, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8961303234100342, + "num_tokens": 665837947.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "grad_norm": 1.5551748275756836, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8863235712051392, + "num_tokens": 665874172.0, + 
"step": 17456 + }, + { + "epoch": 2.220709833354535, + "grad_norm": 1.5113887786865234, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8871132135391235, + "num_tokens": 665917258.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "grad_norm": 1.6004974842071533, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8736822605133057, + "num_tokens": 665956940.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "grad_norm": 1.6443363428115845, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8838001489639282, + "num_tokens": 665993283.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "grad_norm": 1.4181076288223267, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8935221433639526, + "num_tokens": 666035841.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "grad_norm": 1.4798438549041748, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8926723003387451, + "num_tokens": 666074919.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "grad_norm": 1.6425837278366089, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.877930760383606, + "num_tokens": 666112074.0, + "step": 17462 + }, + { + "epoch": 2.221473095026078, + "grad_norm": 1.4299747943878174, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8942480683326721, + "num_tokens": 666151439.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "grad_norm": 1.452583909034729, + "learning_rate": 1e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9062879085540771, + "num_tokens": 666191492.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "grad_norm": 1.4869500398635864, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8876971006393433, + "num_tokens": 666234333.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "grad_norm": 1.6320438385009766, + "learning_rate": 1e-06, + "loss": 
0.3525, + "mean_token_accuracy": 0.8758947253227234, + "num_tokens": 666274464.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "grad_norm": 1.630011796951294, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8990180492401123, + "num_tokens": 666310669.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "grad_norm": 1.4846843481063843, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8899096250534058, + "num_tokens": 666353330.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "grad_norm": 1.4445877075195312, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.891753077507019, + "num_tokens": 666393967.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "grad_norm": 1.5544098615646362, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8649054765701294, + "num_tokens": 666438339.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "grad_norm": 1.5533679723739624, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8854129314422607, + "num_tokens": 666477635.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "grad_norm": 1.5657460689544678, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8763144612312317, + "num_tokens": 666519658.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "grad_norm": 1.4722059965133667, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.88849937915802, + "num_tokens": 666558886.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "grad_norm": 1.5111907720565796, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.885339617729187, + "num_tokens": 666596822.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "grad_norm": 1.5648977756500244, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8877404928207397, + "num_tokens": 666636203.0, + "step": 17475 + }, + { + "epoch": 
2.2231268286477546, + "grad_norm": 1.5418174266815186, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8870956897735596, + "num_tokens": 666674207.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "grad_norm": 1.4883852005004883, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8727145791053772, + "num_tokens": 666715897.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "grad_norm": 1.630842685699463, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8839482069015503, + "num_tokens": 666754695.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "grad_norm": 1.54689621925354, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8909462690353394, + "num_tokens": 666793532.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "grad_norm": 1.4713691473007202, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8934656381607056, + "num_tokens": 666835588.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "grad_norm": 1.5194774866104126, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8819515705108643, + "num_tokens": 666874429.0, + "step": 17481 + }, + { + "epoch": 2.2238900903192977, + "grad_norm": 1.7102997303009033, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8860995173454285, + "num_tokens": 666906643.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "grad_norm": 1.6260634660720825, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8807226419448853, + "num_tokens": 666942504.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "grad_norm": 1.5381577014923096, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8943853378295898, + "num_tokens": 666977554.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "grad_norm": 1.4634191989898682, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 
0.8864839673042297, + "num_tokens": 667020267.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "grad_norm": 1.6677905321121216, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8873691558837891, + "num_tokens": 667053035.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "grad_norm": 1.5315320491790771, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.893219530582428, + "num_tokens": 667090758.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "grad_norm": 1.4773412942886353, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8867096900939941, + "num_tokens": 667133092.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "grad_norm": 1.563076138496399, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8911573886871338, + "num_tokens": 667170671.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "grad_norm": 1.7491663694381714, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8819723129272461, + "num_tokens": 667205944.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "grad_norm": 1.7031203508377075, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8729419708251953, + "num_tokens": 667242452.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "grad_norm": 1.4371010065078735, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8900870084762573, + "num_tokens": 667284036.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "grad_norm": 1.6991630792617798, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8609983921051025, + "num_tokens": 667321462.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "grad_norm": 1.6423338651657104, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8668075799942017, + "num_tokens": 667358291.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "grad_norm": 
1.565553903579712, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8841540813446045, + "num_tokens": 667396934.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "grad_norm": 1.5871925354003906, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8802079558372498, + "num_tokens": 667435011.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "grad_norm": 1.4254964590072632, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8832303285598755, + "num_tokens": 667479751.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "grad_norm": 1.5337283611297607, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8866033554077148, + "num_tokens": 667517508.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "grad_norm": 1.7170078754425049, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8876597881317139, + "num_tokens": 667549018.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "grad_norm": 1.5433018207550049, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8878947496414185, + "num_tokens": 667588000.0, + "step": 17500 + }, + { + "epoch": 2.2263070856125173, + "grad_norm": 1.5410761833190918, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8958439826965332, + "num_tokens": 667623951.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "grad_norm": 1.5032243728637695, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8887382745742798, + "num_tokens": 667664586.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "grad_norm": 1.4526619911193848, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.893176794052124, + "num_tokens": 667705602.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "grad_norm": 1.602340817451477, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8970851898193359, + "num_tokens": 
667739596.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "grad_norm": 1.6200456619262695, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8897936344146729, + "num_tokens": 667774159.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "grad_norm": 1.4650661945343018, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8810856342315674, + "num_tokens": 667815842.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "grad_norm": 1.6175179481506348, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8910176157951355, + "num_tokens": 667848065.0, + "step": 17507 + }, + { + "epoch": 2.227197557562651, + "grad_norm": 1.6484344005584717, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.876939594745636, + "num_tokens": 667888594.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "grad_norm": 1.5656907558441162, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.893478274345398, + "num_tokens": 667925327.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "grad_norm": 1.8816101551055908, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8822648525238037, + "num_tokens": 667955685.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "grad_norm": 1.684800624847412, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8684426546096802, + "num_tokens": 667994765.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "grad_norm": 1.5459706783294678, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8878307938575745, + "num_tokens": 668034843.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "grad_norm": 1.6397532224655151, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8774734139442444, + "num_tokens": 668074773.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "grad_norm": 1.6249158382415771, + "learning_rate": 
1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8836531639099121, + "num_tokens": 668111448.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "grad_norm": 1.4580276012420654, + "learning_rate": 1e-06, + "loss": 0.2705, + "mean_token_accuracy": 0.9020940065383911, + "num_tokens": 668150452.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "grad_norm": 1.705370306968689, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8902791738510132, + "num_tokens": 668187353.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "grad_norm": 1.508164405822754, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8767117857933044, + "num_tokens": 668232937.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "grad_norm": 1.464515209197998, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8916805386543274, + "num_tokens": 668274852.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "grad_norm": 1.503166675567627, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.888647735118866, + "num_tokens": 668315226.0, + "step": 17519 + }, + { + "epoch": 2.2287240809057374, + "grad_norm": 1.5919109582901, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8783049583435059, + "num_tokens": 668350161.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "grad_norm": 1.6095653772354126, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8834882378578186, + "num_tokens": 668387212.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "grad_norm": 1.6443089246749878, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8727632164955139, + "num_tokens": 668425670.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "grad_norm": 1.6213924884796143, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.895154595375061, + "num_tokens": 668459449.0, + "step": 17523 + }, + { + 
"epoch": 2.229232922020099, + "grad_norm": 1.6088920831680298, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8738244771957397, + "num_tokens": 668500049.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "grad_norm": 1.5846811532974243, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8790225386619568, + "num_tokens": 668538300.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "grad_norm": 1.4702937602996826, + "learning_rate": 1e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9033252596855164, + "num_tokens": 668574240.0, + "step": 17526 + }, + { + "epoch": 2.2296145528558706, + "grad_norm": 1.442931890487671, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8846510648727417, + "num_tokens": 668616008.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "grad_norm": 1.6288764476776123, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8858320713043213, + "num_tokens": 668648969.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "grad_norm": 1.602959156036377, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8786424398422241, + "num_tokens": 668689753.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "grad_norm": 1.5228381156921387, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.889428436756134, + "num_tokens": 668729618.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "grad_norm": 1.7453675270080566, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8883463740348816, + "num_tokens": 668762995.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "grad_norm": 1.6133378744125366, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.866848349571228, + "num_tokens": 668802741.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "grad_norm": 1.534718632698059, + "learning_rate": 1e-06, + "loss": 0.2837, + 
"mean_token_accuracy": 0.8977557420730591, + "num_tokens": 668838794.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "grad_norm": 1.537728190422058, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8881815671920776, + "num_tokens": 668879676.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "grad_norm": 1.4491666555404663, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8869343996047974, + "num_tokens": 668921103.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "grad_norm": 1.6893259286880493, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8964869976043701, + "num_tokens": 668953511.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "grad_norm": 1.550437569618225, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.891937792301178, + "num_tokens": 668992131.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "grad_norm": 1.559181571006775, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8917929530143738, + "num_tokens": 669027064.0, + "step": 17538 + }, + { + "epoch": 2.231141076198957, + "grad_norm": 1.523012638092041, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.882347047328949, + "num_tokens": 669066664.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "grad_norm": 1.6071476936340332, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8798779249191284, + "num_tokens": 669106368.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "grad_norm": 1.4419249296188354, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8920203447341919, + "num_tokens": 669144754.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "grad_norm": 1.714411973953247, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8859798908233643, + "num_tokens": 669183191.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + 
"grad_norm": 1.4861470460891724, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8955280184745789, + "num_tokens": 669219822.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "grad_norm": 1.6424565315246582, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8922936916351318, + "num_tokens": 669252320.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "grad_norm": 1.6580629348754883, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8791202306747437, + "num_tokens": 669288382.0, + "step": 17545 + }, + { + "epoch": 2.2320315481490907, + "grad_norm": 1.5813548564910889, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8871736526489258, + "num_tokens": 669325839.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "grad_norm": 1.5697503089904785, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8825840950012207, + "num_tokens": 669364177.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "grad_norm": 1.5475183725357056, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8863661885261536, + "num_tokens": 669401558.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "grad_norm": 1.4771533012390137, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8914827704429626, + "num_tokens": 669441893.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "grad_norm": 1.5560399293899536, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8816752433776855, + "num_tokens": 669483492.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "grad_norm": 1.469123363494873, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8893499374389648, + "num_tokens": 669528723.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "grad_norm": 1.6024916172027588, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8828129768371582, + 
"num_tokens": 669564054.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "grad_norm": 1.5288010835647583, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8884509801864624, + "num_tokens": 669598841.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "grad_norm": 1.502059817314148, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.888660728931427, + "num_tokens": 669636205.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "grad_norm": 1.648077368736267, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8846299648284912, + "num_tokens": 669672598.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "grad_norm": 1.7205640077590942, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8753291964530945, + "num_tokens": 669706158.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "grad_norm": 1.557674765586853, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8862423896789551, + "num_tokens": 669742732.0, + "step": 17557 + }, + { + "epoch": 2.2335580714921766, + "grad_norm": 1.6773163080215454, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8684552311897278, + "num_tokens": 669783608.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "grad_norm": 1.6617012023925781, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8921617269515991, + "num_tokens": 669827421.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "grad_norm": 1.662092685699463, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8796176910400391, + "num_tokens": 669864270.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "grad_norm": 1.6613962650299072, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8727466464042664, + "num_tokens": 669904993.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "grad_norm": 1.5683714151382446, + 
"learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8757416009902954, + "num_tokens": 669947173.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "grad_norm": 1.532855749130249, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8757005333900452, + "num_tokens": 669991365.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "grad_norm": 1.5668519735336304, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8839049935340881, + "num_tokens": 670029043.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "grad_norm": 1.5961635112762451, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8875758647918701, + "num_tokens": 670063776.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "grad_norm": 1.4867734909057617, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8736026883125305, + "num_tokens": 670106271.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "grad_norm": 1.5825778245925903, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8750321865081787, + "num_tokens": 670147230.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "grad_norm": 1.4954298734664917, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8905152082443237, + "num_tokens": 670188131.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "grad_norm": 1.6906903982162476, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8817225694656372, + "num_tokens": 670221324.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "grad_norm": 1.3620105981826782, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8969804048538208, + "num_tokens": 670263533.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "grad_norm": 1.595253348350525, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8893609642982483, + "num_tokens": 670298685.0, + 
"step": 17571 + }, + { + "epoch": 2.2353390153924435, + "grad_norm": 1.464167594909668, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8911956548690796, + "num_tokens": 670340584.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "grad_norm": 1.602251410484314, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8830438852310181, + "num_tokens": 670381353.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "grad_norm": 1.4865130186080933, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8808671832084656, + "num_tokens": 670425540.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "grad_norm": 1.5472837686538696, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8765113949775696, + "num_tokens": 670467851.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "grad_norm": 1.5216673612594604, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8884369730949402, + "num_tokens": 670504990.0, + "step": 17576 + }, + { + "epoch": 2.235975066785396, + "grad_norm": 1.5202622413635254, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8794883489608765, + "num_tokens": 670546734.0, + "step": 17577 + }, + { + "epoch": 2.2361022770639867, + "grad_norm": 1.498859167098999, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.893558919429779, + "num_tokens": 670586515.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "grad_norm": 1.6331148147583008, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8812081217765808, + "num_tokens": 670623997.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "grad_norm": 1.5726603269577026, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8880332708358765, + "num_tokens": 670664096.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "grad_norm": 1.6001160144805908, + "learning_rate": 1e-06, + "loss": 
0.3316, + "mean_token_accuracy": 0.8828796148300171, + "num_tokens": 670699698.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "grad_norm": 1.6247854232788086, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8787295818328857, + "num_tokens": 670737305.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "grad_norm": 1.7044944763183594, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.878058910369873, + "num_tokens": 670772563.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "grad_norm": 1.6114834547042847, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8788064122200012, + "num_tokens": 670810234.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "grad_norm": 1.4887760877609253, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8890750408172607, + "num_tokens": 670850045.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "grad_norm": 1.5707809925079346, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8773806691169739, + "num_tokens": 670888719.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "grad_norm": 1.4228832721710205, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.888138473033905, + "num_tokens": 670929401.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "grad_norm": 1.564724326133728, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8845522403717041, + "num_tokens": 670967918.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "grad_norm": 1.639465570449829, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8849101662635803, + "num_tokens": 671001729.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "grad_norm": 1.5597925186157227, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8936811685562134, + "num_tokens": 671040367.0, + "step": 17590 + }, + { + "epoch": 
2.2377560106856635, + "grad_norm": 1.5568751096725464, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8749751448631287, + "num_tokens": 671080865.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "grad_norm": 1.6056339740753174, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8951386213302612, + "num_tokens": 671114286.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "grad_norm": 1.474068284034729, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8817300200462341, + "num_tokens": 671158823.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "grad_norm": 1.6174731254577637, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8951762914657593, + "num_tokens": 671193731.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "grad_norm": 1.5743762254714966, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8756616115570068, + "num_tokens": 671232788.0, + "step": 17595 + }, + { + "epoch": 2.238392062078616, + "grad_norm": 1.5390052795410156, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8906479477882385, + "num_tokens": 671272135.0, + "step": 17596 + }, + { + "epoch": 2.2385192723572063, + "grad_norm": 1.4224178791046143, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8852840065956116, + "num_tokens": 671317274.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "grad_norm": 1.591737985610962, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8824207782745361, + "num_tokens": 671354425.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "grad_norm": 1.60933256149292, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8822459578514099, + "num_tokens": 671395030.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "grad_norm": 1.5992177724838257, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 
0.883609414100647, + "num_tokens": 671431069.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "grad_norm": 1.583459734916687, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8877244591712952, + "num_tokens": 671468458.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "grad_norm": 1.5803301334381104, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8965051174163818, + "num_tokens": 671505952.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "grad_norm": 1.4840518236160278, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8936132192611694, + "num_tokens": 671547721.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "grad_norm": 1.4658877849578857, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8861591815948486, + "num_tokens": 671590413.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "grad_norm": 1.5949655771255493, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8829020261764526, + "num_tokens": 671631546.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "grad_norm": 1.5578196048736572, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8787318468093872, + "num_tokens": 671668476.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "grad_norm": 1.582922101020813, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.894501268863678, + "num_tokens": 671703516.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "grad_norm": 1.517853856086731, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8888542056083679, + "num_tokens": 671742549.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "grad_norm": 1.4751068353652954, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8869601488113403, + "num_tokens": 671783986.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "grad_norm": 
1.5738954544067383, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8852472305297852, + "num_tokens": 671825250.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "grad_norm": 1.6586055755615234, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8665924072265625, + "num_tokens": 671862655.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "grad_norm": 1.7161344289779663, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8682665824890137, + "num_tokens": 671899500.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "grad_norm": 1.6032178401947021, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8932561278343201, + "num_tokens": 671934285.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "grad_norm": 1.5734124183654785, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.8992676734924316, + "num_tokens": 671968704.0, + "step": 17614 + }, + { + "epoch": 2.2408090573718358, + "grad_norm": 1.4563071727752686, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8838934898376465, + "num_tokens": 672010755.0, + "step": 17615 + }, + { + "epoch": 2.2409362676504263, + "grad_norm": 1.614992618560791, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8722041845321655, + "num_tokens": 672049164.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "grad_norm": 1.5593594312667847, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8842473030090332, + "num_tokens": 672085215.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "grad_norm": 1.4709094762802124, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8870411515235901, + "num_tokens": 672125167.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "grad_norm": 1.545983910560608, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8890491724014282, + "num_tokens": 
672164377.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "grad_norm": 1.5277996063232422, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.891283392906189, + "num_tokens": 672204417.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "grad_norm": 1.5820143222808838, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8786114454269409, + "num_tokens": 672243156.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "grad_norm": 1.6645314693450928, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8849618434906006, + "num_tokens": 672277690.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "grad_norm": 1.5179275274276733, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8958508968353271, + "num_tokens": 672314735.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "grad_norm": 1.508162021636963, + "learning_rate": 1e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.898249626159668, + "num_tokens": 672350571.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "grad_norm": 1.5811423063278198, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.875723123550415, + "num_tokens": 672390121.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "grad_norm": 1.5938911437988281, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8809027075767517, + "num_tokens": 672424221.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "grad_norm": 1.5916614532470703, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8899825811386108, + "num_tokens": 672459381.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "grad_norm": 1.5288482904434204, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8853176832199097, + "num_tokens": 672502347.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "grad_norm": 1.4890308380126953, + "learning_rate": 
1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8809802532196045, + "num_tokens": 672543396.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "grad_norm": 1.6601088047027588, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8755254149436951, + "num_tokens": 672582677.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "grad_norm": 1.6221179962158203, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.881729006767273, + "num_tokens": 672620564.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "grad_norm": 1.3598533868789673, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8918057084083557, + "num_tokens": 672665187.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "grad_norm": 1.594802737236023, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8926678895950317, + "num_tokens": 672700432.0, + "step": 17633 + }, + { + "epoch": 2.2432260526650554, + "grad_norm": 1.653423547744751, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8863726258277893, + "num_tokens": 672736327.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "grad_norm": 1.6817232370376587, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.9014423489570618, + "num_tokens": 672768864.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "grad_norm": 1.8309950828552246, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8664494752883911, + "num_tokens": 672802743.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "grad_norm": 1.4772608280181885, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8803436756134033, + "num_tokens": 672847570.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "grad_norm": 1.7741496562957764, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8909443616867065, + "num_tokens": 672878709.0, + "step": 17638 + }, + { + 
"epoch": 2.243862104058008, + "grad_norm": 1.484257459640503, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.894149899482727, + "num_tokens": 672920082.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "grad_norm": 1.5893105268478394, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8831808567047119, + "num_tokens": 672959516.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "grad_norm": 1.4069421291351318, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8866274356842041, + "num_tokens": 673003870.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "grad_norm": 1.501484751701355, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8865249752998352, + "num_tokens": 673046789.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "grad_norm": 1.4002177715301514, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8865563869476318, + "num_tokens": 673090317.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "grad_norm": 1.5064857006072998, + "learning_rate": 1e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.8992857336997986, + "num_tokens": 673127215.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "grad_norm": 1.5510585308074951, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8917087316513062, + "num_tokens": 673162524.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "grad_norm": 1.563563585281372, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8755850791931152, + "num_tokens": 673205998.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "grad_norm": 1.6480212211608887, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8867598176002502, + "num_tokens": 673240618.0, + "step": 17647 + }, + { + "epoch": 2.2450069965653223, + "grad_norm": 1.4496909379959106, + "learning_rate": 1e-06, + "loss": 0.3054, + 
"mean_token_accuracy": 0.8888410329818726, + "num_tokens": 673283827.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "grad_norm": 1.4008796215057373, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8983477354049683, + "num_tokens": 673326252.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "grad_norm": 1.610308051109314, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8744720816612244, + "num_tokens": 673365384.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "grad_norm": 1.625559687614441, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8919767141342163, + "num_tokens": 673401247.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "grad_norm": 1.4789522886276245, + "learning_rate": 1e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9068838357925415, + "num_tokens": 673437938.0, + "step": 17652 + }, + { + "epoch": 2.245643047958275, + "grad_norm": 1.651739478111267, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8844606876373291, + "num_tokens": 673471028.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "grad_norm": 1.5547720193862915, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8842151761054993, + "num_tokens": 673510319.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "grad_norm": 1.6000182628631592, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.891802966594696, + "num_tokens": 673548030.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "grad_norm": 1.3706456422805786, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8973115086555481, + "num_tokens": 673590541.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "grad_norm": 1.64574134349823, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.883385181427002, + "num_tokens": 673624122.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + 
"grad_norm": 1.5737321376800537, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8846243619918823, + "num_tokens": 673665784.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "grad_norm": 1.4719898700714111, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8793318271636963, + "num_tokens": 673705256.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "grad_norm": 1.7390570640563965, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8746500611305237, + "num_tokens": 673744086.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "grad_norm": 1.5593262910842896, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8879108428955078, + "num_tokens": 673785577.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "grad_norm": 1.4823858737945557, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8800472021102905, + "num_tokens": 673829138.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "grad_norm": 1.7592560052871704, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8804042339324951, + "num_tokens": 673860973.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "grad_norm": 1.5078295469284058, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8952596783638, + "num_tokens": 673900139.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "grad_norm": 1.686696171760559, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8707564473152161, + "num_tokens": 673944677.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "grad_norm": 1.6400724649429321, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8764957189559937, + "num_tokens": 673988844.0, + "step": 17666 + }, + { + "epoch": 2.2474239918585424, + "grad_norm": 1.4809807538986206, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8880677223205566, + 
"num_tokens": 674032483.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "grad_norm": 1.6909795999526978, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8719388842582703, + "num_tokens": 674068741.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "grad_norm": 1.4312235116958618, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8933656811714172, + "num_tokens": 674112227.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "grad_norm": 1.6536768674850464, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8790196180343628, + "num_tokens": 674149918.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "grad_norm": 1.817878246307373, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8779765367507935, + "num_tokens": 674182689.0, + "step": 17671 + }, + { + "epoch": 2.2480600432514946, + "grad_norm": 1.6758888959884644, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8723828792572021, + "num_tokens": 674222713.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "grad_norm": 1.5738590955734253, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8842200636863708, + "num_tokens": 674260105.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "grad_norm": 1.5642931461334229, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8809521198272705, + "num_tokens": 674298133.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "grad_norm": 1.5253022909164429, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8924734592437744, + "num_tokens": 674335246.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "grad_norm": 1.486279845237732, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8887123465538025, + "num_tokens": 674375003.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "grad_norm": 1.676793098449707, + 
"learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8787966966629028, + "num_tokens": 674410244.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "grad_norm": 1.5866771936416626, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8752248287200928, + "num_tokens": 674450044.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "grad_norm": 1.4491956233978271, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8920379877090454, + "num_tokens": 674490507.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "grad_norm": 1.579612135887146, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8973751068115234, + "num_tokens": 674525417.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "grad_norm": 1.640993356704712, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8873255252838135, + "num_tokens": 674560446.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "grad_norm": 1.584324598312378, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8902845978736877, + "num_tokens": 674597527.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "grad_norm": 1.5938472747802734, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8821884989738464, + "num_tokens": 674635444.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "grad_norm": 1.5395493507385254, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8894799947738647, + "num_tokens": 674672659.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "grad_norm": 1.4993332624435425, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8898330926895142, + "num_tokens": 674711214.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, + "grad_norm": 1.6370429992675781, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8718736171722412, + "num_tokens": 674746744.0, + "step": 
17686 + }, + { + "epoch": 2.2499681974303525, + "grad_norm": 1.732075572013855, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8691275715827942, + "num_tokens": 674782183.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "grad_norm": 1.5304538011550903, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8933011293411255, + "num_tokens": 674822795.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "grad_norm": 1.7917394638061523, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8738124370574951, + "num_tokens": 674857441.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "grad_norm": 1.5952612161636353, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8789659738540649, + "num_tokens": 674896144.0, + "step": 17690 + }, + { + "epoch": 2.2504770385447146, + "grad_norm": 1.6390104293823242, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.888886034488678, + "num_tokens": 674931104.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "grad_norm": 1.56111741065979, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8945401310920715, + "num_tokens": 674966541.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "grad_norm": 1.6007051467895508, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8824048042297363, + "num_tokens": 675003270.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "grad_norm": 1.4684826135635376, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8877695798873901, + "num_tokens": 675044470.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "grad_norm": 1.5357428789138794, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8785074949264526, + "num_tokens": 675082750.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "grad_norm": 1.630014419555664, + "learning_rate": 1e-06, + "loss": 0.3515, + 
"mean_token_accuracy": 0.8758171200752258, + "num_tokens": 675119160.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "grad_norm": 1.6032154560089111, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8746238350868225, + "num_tokens": 675157850.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "grad_norm": 1.4724843502044678, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8900070786476135, + "num_tokens": 675198523.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "grad_norm": 1.6459012031555176, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8653660416603088, + "num_tokens": 675236309.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "grad_norm": 1.682572603225708, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8732509613037109, + "num_tokens": 675273440.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "grad_norm": 1.5702753067016602, + "learning_rate": 1e-06, + "loss": 0.2775, + "mean_token_accuracy": 0.8993688821792603, + "num_tokens": 675306129.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "grad_norm": 1.530739665031433, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8924944996833801, + "num_tokens": 675345955.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "grad_norm": 1.579870581626892, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8779510259628296, + "num_tokens": 675388204.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "grad_norm": 1.6665265560150146, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8803709149360657, + "num_tokens": 675424478.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "grad_norm": 1.4213511943817139, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8870102167129517, + "num_tokens": 675465282.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, 
+ "grad_norm": 1.554985523223877, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8861589431762695, + "num_tokens": 675503012.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "grad_norm": 1.4541032314300537, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8831085562705994, + "num_tokens": 675547407.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "grad_norm": 1.5860317945480347, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8883998394012451, + "num_tokens": 675584545.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "grad_norm": 1.6036324501037598, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8834642767906189, + "num_tokens": 675620111.0, + "step": 17709 + }, + { + "epoch": 2.252894033837934, + "grad_norm": 1.6500760316848755, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8809110522270203, + "num_tokens": 675657396.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "grad_norm": 1.6604746580123901, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8981183767318726, + "num_tokens": 675690360.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "grad_norm": 1.5951417684555054, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8842583894729614, + "num_tokens": 675728794.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "grad_norm": 1.5163800716400146, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8773825764656067, + "num_tokens": 675772514.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "grad_norm": 1.5642176866531372, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8868434429168701, + "num_tokens": 675808072.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "grad_norm": 1.5616075992584229, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 
0.8772064447402954, + "num_tokens": 675846074.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "grad_norm": 1.5123789310455322, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8878954648971558, + "num_tokens": 675888806.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "grad_norm": 1.5575907230377197, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8906840085983276, + "num_tokens": 675928077.0, + "step": 17717 + }, + { + "epoch": 2.253911716066658, + "grad_norm": 1.5187005996704102, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8897675275802612, + "num_tokens": 675964413.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "grad_norm": 1.7046009302139282, + "learning_rate": 1e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.8985928297042847, + "num_tokens": 675995741.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "grad_norm": 1.5680530071258545, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.883754551410675, + "num_tokens": 676033783.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "grad_norm": 1.5471000671386719, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8925747871398926, + "num_tokens": 676069481.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "grad_norm": 1.506423830986023, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8817866444587708, + "num_tokens": 676112916.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "grad_norm": 1.4590510129928589, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8778878450393677, + "num_tokens": 676158714.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "grad_norm": 1.6149896383285522, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8924891948699951, + "num_tokens": 676193918.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "grad_norm": 
1.6119532585144043, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8957142233848572, + "num_tokens": 676229110.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "grad_norm": 1.3556488752365112, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9023572206497192, + "num_tokens": 676274829.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "grad_norm": 1.5636037588119507, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8889665007591248, + "num_tokens": 676313233.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "grad_norm": 1.5880002975463867, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8806122541427612, + "num_tokens": 676351691.0, + "step": 17728 + }, + { + "epoch": 2.2553110291311538, + "grad_norm": 1.6052420139312744, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8799698352813721, + "num_tokens": 676390473.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "grad_norm": 1.5463742017745972, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8797534108161926, + "num_tokens": 676429311.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "grad_norm": 1.5369974374771118, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.886843204498291, + "num_tokens": 676470526.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "grad_norm": 1.5529576539993286, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8892690539360046, + "num_tokens": 676506988.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "grad_norm": 1.613537073135376, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8842391967773438, + "num_tokens": 676541458.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "grad_norm": 1.5870895385742188, + "learning_rate": 1e-06, + "loss": 0.2651, + "mean_token_accuracy": 0.9048882722854614, + "num_tokens": 
676574770.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "grad_norm": 1.6637405157089233, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8878859877586365, + "num_tokens": 676610011.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "grad_norm": 1.5623857975006104, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8790861368179321, + "num_tokens": 676648961.0, + "step": 17736 + }, + { + "epoch": 2.256328711359878, + "grad_norm": 1.4869893789291382, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.89146888256073, + "num_tokens": 676689208.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "grad_norm": 1.618895173072815, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8883056640625, + "num_tokens": 676728127.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "grad_norm": 1.671882152557373, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8954516649246216, + "num_tokens": 676761586.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "grad_norm": 1.5871504545211792, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8834491968154907, + "num_tokens": 676796027.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "grad_norm": 1.7627623081207275, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8667937517166138, + "num_tokens": 676837870.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "grad_norm": 1.5356152057647705, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8828086853027344, + "num_tokens": 676879315.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "grad_norm": 1.567175269126892, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8934271335601807, + "num_tokens": 676915324.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "grad_norm": 1.5519055128097534, + "learning_rate": 1e-06, + 
"loss": 0.3114, + "mean_token_accuracy": 0.8866342902183533, + "num_tokens": 676953084.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "grad_norm": 1.4865034818649292, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8854118585586548, + "num_tokens": 676997836.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "grad_norm": 1.7009692192077637, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.896203875541687, + "num_tokens": 677032951.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "grad_norm": 1.5885556936264038, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8924187421798706, + "num_tokens": 677069447.0, + "step": 17747 + }, + { + "epoch": 2.2577280244243734, + "grad_norm": 1.547344446182251, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8838657736778259, + "num_tokens": 677108884.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "grad_norm": 1.6974341869354248, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8844578266143799, + "num_tokens": 677151670.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "grad_norm": 1.5521951913833618, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8816202878952026, + "num_tokens": 677192157.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "grad_norm": 1.4787931442260742, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.888434648513794, + "num_tokens": 677235173.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "grad_norm": 1.4116756916046143, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8852013349533081, + "num_tokens": 677281450.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "grad_norm": 1.4546122550964355, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8924577236175537, + "num_tokens": 677320937.0, + "step": 17753 + }, + { + "epoch": 
2.2584912860959165, + "grad_norm": 1.7083317041397095, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8917324542999268, + "num_tokens": 677353007.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "grad_norm": 1.412135362625122, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8797814846038818, + "num_tokens": 677397634.0, + "step": 17755 + }, + { + "epoch": 2.2587457066530976, + "grad_norm": 1.6386616230010986, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8790321350097656, + "num_tokens": 677435545.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "grad_norm": 1.4921166896820068, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8910872340202332, + "num_tokens": 677479320.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "grad_norm": 1.5982837677001953, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8850741386413574, + "num_tokens": 677518193.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "grad_norm": 1.508717656135559, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.875507116317749, + "num_tokens": 677559020.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "grad_norm": 1.5531026124954224, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8952372074127197, + "num_tokens": 677595222.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "grad_norm": 1.5824543237686157, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8674466609954834, + "num_tokens": 677635014.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "grad_norm": 1.540526032447815, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8853628039360046, + "num_tokens": 677674507.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "grad_norm": 1.4535348415374756, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 
0.8891353011131287, + "num_tokens": 677717074.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "grad_norm": 1.6870083808898926, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.893440306186676, + "num_tokens": 677750495.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "grad_norm": 1.6753408908843994, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8952126502990723, + "num_tokens": 677784341.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "grad_norm": 1.4960535764694214, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8876487612724304, + "num_tokens": 677821784.0, + "step": 17766 + }, + { + "epoch": 2.2601450197175934, + "grad_norm": 1.461410641670227, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.885027289390564, + "num_tokens": 677865271.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "grad_norm": 1.6664751768112183, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8861709833145142, + "num_tokens": 677902930.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "grad_norm": 1.7051194906234741, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8784112930297852, + "num_tokens": 677942326.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "grad_norm": 1.6557753086090088, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.881354808807373, + "num_tokens": 677981915.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "grad_norm": 1.5683293342590332, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8905493021011353, + "num_tokens": 678018616.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "grad_norm": 1.6732134819030762, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8844695687294006, + "num_tokens": 678051686.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "grad_norm": 
1.5927284955978394, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8790163993835449, + "num_tokens": 678087376.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "grad_norm": 1.7530207633972168, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8768929839134216, + "num_tokens": 678120505.0, + "step": 17774 + }, + { + "epoch": 2.261162701946317, + "grad_norm": 1.5663472414016724, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8901634216308594, + "num_tokens": 678156278.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "grad_norm": 1.5636422634124756, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8867858052253723, + "num_tokens": 678193608.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "grad_norm": 1.5318886041641235, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8994672298431396, + "num_tokens": 678231073.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "grad_norm": 1.6517730951309204, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8797256946563721, + "num_tokens": 678268202.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "grad_norm": 1.4787081480026245, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8967483639717102, + "num_tokens": 678309797.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "grad_norm": 1.5073654651641846, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.893146276473999, + "num_tokens": 678350305.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "grad_norm": 1.5628776550292969, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8932270407676697, + "num_tokens": 678389270.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "grad_norm": 1.582964301109314, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8804044127464294, + "num_tokens": 
678430429.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "grad_norm": 1.77858304977417, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.8936715722084045, + "num_tokens": 678462911.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "grad_norm": 1.5708684921264648, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.878483772277832, + "num_tokens": 678503946.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "grad_norm": 1.5524581670761108, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8851461410522461, + "num_tokens": 678542685.0, + "step": 17785 + }, + { + "epoch": 2.262562015010813, + "grad_norm": 1.4943028688430786, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8789419531822205, + "num_tokens": 678588186.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "grad_norm": 1.5189456939697266, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.889199435710907, + "num_tokens": 678627683.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "grad_norm": 1.55866277217865, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.883287787437439, + "num_tokens": 678667772.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "grad_norm": 1.5810596942901611, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8954014182090759, + "num_tokens": 678703886.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "grad_norm": 1.7061604261398315, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8846511840820312, + "num_tokens": 678736049.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "grad_norm": 1.5742050409317017, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8894147872924805, + "num_tokens": 678776106.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "grad_norm": 1.7267374992370605, + "learning_rate": 1e-06, + 
"loss": 0.3411, + "mean_token_accuracy": 0.8787709474563599, + "num_tokens": 678811652.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "grad_norm": 1.645740270614624, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8730241656303406, + "num_tokens": 678849858.0, + "step": 17793 + }, + { + "epoch": 2.263579697239537, + "grad_norm": 1.5685292482376099, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8951773643493652, + "num_tokens": 678886431.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "grad_norm": 1.4851057529449463, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.894302487373352, + "num_tokens": 678924664.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "grad_norm": 1.5117876529693604, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.88031005859375, + "num_tokens": 678967519.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "grad_norm": 1.6374231576919556, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.891715407371521, + "num_tokens": 679004721.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "grad_norm": 1.63003671169281, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.887383759021759, + "num_tokens": 679039578.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "grad_norm": 1.6759072542190552, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8600924015045166, + "num_tokens": 679076300.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "grad_norm": 1.6433712244033813, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8767704963684082, + "num_tokens": 679114189.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "grad_norm": 1.4899390935897827, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8858119249343872, + "num_tokens": 679154646.0, + "step": 17801 + }, + { + "epoch": 
2.264597379468261, + "grad_norm": 1.6136807203292847, + "learning_rate": 1e-06, + "loss": 0.2703, + "mean_token_accuracy": 0.9015722274780273, + "num_tokens": 679187154.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "grad_norm": 1.5626680850982666, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8854416608810425, + "num_tokens": 679222396.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "grad_norm": 1.5547175407409668, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.891465961933136, + "num_tokens": 679259221.0, + "step": 17804 + }, + { + "epoch": 2.2649790103040326, + "grad_norm": 1.6636531352996826, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.885087251663208, + "num_tokens": 679294238.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "grad_norm": 1.6347033977508545, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8829292058944702, + "num_tokens": 679330610.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "grad_norm": 1.5685995817184448, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8874578475952148, + "num_tokens": 679368652.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "grad_norm": 1.6869159936904907, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8640523552894592, + "num_tokens": 679403472.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "grad_norm": 1.5969939231872559, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8789410591125488, + "num_tokens": 679442059.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "grad_norm": 1.6854965686798096, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8877520561218262, + "num_tokens": 679475928.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "grad_norm": 1.5597481727600098, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 
0.8795724511146545, + "num_tokens": 679516206.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "grad_norm": 1.6709455251693726, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.892306387424469, + "num_tokens": 679549766.0, + "step": 17812 + }, + { + "epoch": 2.265996692532757, + "grad_norm": 1.6157922744750977, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8681995272636414, + "num_tokens": 679590660.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "grad_norm": 1.6633700132369995, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8739961385726929, + "num_tokens": 679625939.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "grad_norm": 1.58626389503479, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8847206830978394, + "num_tokens": 679662556.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "grad_norm": 1.7755669355392456, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8721967339515686, + "num_tokens": 679695452.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "grad_norm": 1.6206789016723633, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8823578357696533, + "num_tokens": 679732227.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "grad_norm": 1.6249516010284424, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8931179642677307, + "num_tokens": 679765730.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "grad_norm": 1.5242162942886353, + "learning_rate": 1e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.9009367227554321, + "num_tokens": 679802169.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "grad_norm": 1.4859247207641602, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8945904970169067, + "num_tokens": 679843284.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "grad_norm": 
1.5210500955581665, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.885935366153717, + "num_tokens": 679882640.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "grad_norm": 1.5422091484069824, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8631056547164917, + "num_tokens": 679924404.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "grad_norm": 1.814488172531128, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8815097212791443, + "num_tokens": 679955969.0, + "step": 17823 + }, + { + "epoch": 2.267396005597252, + "grad_norm": 1.6499531269073486, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8942247629165649, + "num_tokens": 679990236.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "grad_norm": 1.6193649768829346, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8907787799835205, + "num_tokens": 680024431.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "grad_norm": 1.464919090270996, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8971970677375793, + "num_tokens": 680063969.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "grad_norm": 1.7781963348388672, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8726097345352173, + "num_tokens": 680096352.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "grad_norm": 1.5938701629638672, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8840430974960327, + "num_tokens": 680131924.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "grad_norm": 1.5936996936798096, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8858566284179688, + "num_tokens": 680170441.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "grad_norm": 1.5256351232528687, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8755253553390503, + "num_tokens": 
680215612.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "grad_norm": 1.536889672279358, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8927749395370483, + "num_tokens": 680254696.0, + "step": 17831 + }, + { + "epoch": 2.2684136878259764, + "grad_norm": 1.6753337383270264, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8808984756469727, + "num_tokens": 680288978.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "grad_norm": 1.6442968845367432, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8794163465499878, + "num_tokens": 680322157.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "grad_norm": 1.6567249298095703, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8834091424942017, + "num_tokens": 680356737.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "grad_norm": 1.4405407905578613, + "learning_rate": 1e-06, + "loss": 0.259, + "mean_token_accuracy": 0.905907154083252, + "num_tokens": 680392198.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "grad_norm": 1.6051520109176636, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8837336897850037, + "num_tokens": 680429383.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "grad_norm": 1.615395188331604, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8872123956680298, + "num_tokens": 680464187.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "grad_norm": 1.7312644720077515, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8879597187042236, + "num_tokens": 680495013.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "grad_norm": 1.799185037612915, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8696789741516113, + "num_tokens": 680531278.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "grad_norm": 1.5599671602249146, + "learning_rate": 
1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8802059888839722, + "num_tokens": 680571469.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "grad_norm": 1.5272971391677856, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.877951979637146, + "num_tokens": 680611876.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "grad_norm": 1.5536314249038696, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8840376138687134, + "num_tokens": 680648444.0, + "step": 17842 + }, + { + "epoch": 2.2698130008904718, + "grad_norm": 1.6041938066482544, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8746702671051025, + "num_tokens": 680685747.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "grad_norm": 1.5117143392562866, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8851216435432434, + "num_tokens": 680728302.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "grad_norm": 1.4822685718536377, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8954628109931946, + "num_tokens": 680766831.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "grad_norm": 1.5280338525772095, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.9005149602890015, + "num_tokens": 680803497.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "grad_norm": 1.609294056892395, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8904871344566345, + "num_tokens": 680839498.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "grad_norm": 1.5022042989730835, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8800057172775269, + "num_tokens": 680880174.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "grad_norm": 1.517676591873169, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.885838508605957, + "num_tokens": 680917537.0, + "step": 17849 + }, + { + 
"epoch": 2.2707034728406055, + "grad_norm": 1.5971732139587402, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8749922513961792, + "num_tokens": 680956681.0, + "step": 17850 + }, + { + "epoch": 2.270830683119196, + "grad_norm": 1.5509330034255981, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8832840919494629, + "num_tokens": 680993230.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "grad_norm": 1.642919898033142, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8830324411392212, + "num_tokens": 681027571.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "grad_norm": 1.6427483558654785, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8809370994567871, + "num_tokens": 681065043.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "grad_norm": 1.6780298948287964, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8669430017471313, + "num_tokens": 681104235.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "grad_norm": 1.6614229679107666, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8881069421768188, + "num_tokens": 681139407.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "grad_norm": 1.6910336017608643, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8842340707778931, + "num_tokens": 681172521.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "grad_norm": 1.600437045097351, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.882830798625946, + "num_tokens": 681209148.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "grad_norm": 1.4344854354858398, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8793702125549316, + "num_tokens": 681256239.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "grad_norm": 1.6038776636123657, + "learning_rate": 1e-06, + "loss": 0.3449, + 
"mean_token_accuracy": 0.8831285834312439, + "num_tokens": 681295192.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "grad_norm": 1.5413621664047241, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8982141017913818, + "num_tokens": 681332533.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "grad_norm": 1.6518473625183105, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.890293300151825, + "num_tokens": 681373255.0, + "step": 17861 + }, + { + "epoch": 2.272229996183692, + "grad_norm": 1.6669762134552002, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8796244859695435, + "num_tokens": 681406193.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "grad_norm": 1.5007163286209106, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.9011774659156799, + "num_tokens": 681444850.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "grad_norm": 1.6224440336227417, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.9006409645080566, + "num_tokens": 681476301.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "grad_norm": 1.4675742387771606, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.885324239730835, + "num_tokens": 681517487.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "grad_norm": 1.5975314378738403, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8782516717910767, + "num_tokens": 681556535.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "grad_norm": 1.4914829730987549, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8847296237945557, + "num_tokens": 681599232.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "grad_norm": 1.5550485849380493, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.884751558303833, + "num_tokens": 681636957.0, + "step": 17868 + }, + { + "epoch": 
2.273120468133825, + "grad_norm": 1.5398221015930176, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8917657732963562, + "num_tokens": 681674366.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "grad_norm": 1.5548762083053589, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8835395574569702, + "num_tokens": 681713388.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "grad_norm": 1.5162653923034668, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8864409923553467, + "num_tokens": 681753485.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "grad_norm": 1.5661191940307617, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8835813999176025, + "num_tokens": 681795948.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "grad_norm": 1.6074943542480469, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8870991468429565, + "num_tokens": 681833819.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "grad_norm": 1.684881329536438, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8837749361991882, + "num_tokens": 681868961.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "grad_norm": 1.5341380834579468, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8886842727661133, + "num_tokens": 681906174.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "grad_norm": 1.6025692224502563, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8813351392745972, + "num_tokens": 681944189.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "grad_norm": 1.5198849439620972, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8915035128593445, + "num_tokens": 681983680.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "grad_norm": 1.5945905447006226, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 
0.8709869384765625, + "num_tokens": 682025767.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "grad_norm": 1.5602774620056152, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8868439197540283, + "num_tokens": 682064266.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "grad_norm": 1.6120917797088623, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8720739483833313, + "num_tokens": 682105655.0, + "step": 17880 + }, + { + "epoch": 2.2746469914769114, + "grad_norm": 1.6434191465377808, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8961160182952881, + "num_tokens": 682139859.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "grad_norm": 1.476979374885559, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8806525468826294, + "num_tokens": 682183238.0, + "step": 17882 + }, + { + "epoch": 2.2749014120340925, + "grad_norm": 1.5154755115509033, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8863502740859985, + "num_tokens": 682223273.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "grad_norm": 1.6341557502746582, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.883629322052002, + "num_tokens": 682260612.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "grad_norm": 1.6480987071990967, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8725078105926514, + "num_tokens": 682301776.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "grad_norm": 1.5348942279815674, + "learning_rate": 1e-06, + "loss": 0.2648, + "mean_token_accuracy": 0.9039135575294495, + "num_tokens": 682336896.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "grad_norm": 1.8511656522750854, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8797323107719421, + "num_tokens": 682366228.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "grad_norm": 
1.4370304346084595, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8899351358413696, + "num_tokens": 682408624.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "grad_norm": 1.4976146221160889, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8894423842430115, + "num_tokens": 682449304.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "grad_norm": 1.5743188858032227, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8917806148529053, + "num_tokens": 682486107.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "grad_norm": 1.4577142000198364, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8746607303619385, + "num_tokens": 682531943.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "grad_norm": 1.469126582145691, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8943111300468445, + "num_tokens": 682572344.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "grad_norm": 1.555495262145996, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8845638632774353, + "num_tokens": 682612512.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "grad_norm": 1.555478572845459, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8818888664245605, + "num_tokens": 682651449.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "grad_norm": 1.6688753366470337, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8835048675537109, + "num_tokens": 682692578.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "grad_norm": 1.3931145668029785, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8998002409934998, + "num_tokens": 682740413.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "grad_norm": 1.534240484237671, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8805416226387024, + "num_tokens": 
682781397.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "grad_norm": 1.4759865999221802, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8828417062759399, + "num_tokens": 682822182.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "grad_norm": 1.6323999166488647, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8865042924880981, + "num_tokens": 682856271.0, + "step": 17899 + }, + { + "epoch": 2.277063986770131, + "grad_norm": 1.4184430837631226, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8943725824356079, + "num_tokens": 682901440.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "grad_norm": 1.566811203956604, + "learning_rate": 1e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.9008828401565552, + "num_tokens": 682932634.0, + "step": 17901 + }, + { + "epoch": 2.277318407327312, + "grad_norm": 1.6187649965286255, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.873015284538269, + "num_tokens": 682972029.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "grad_norm": 1.519397497177124, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.9017435312271118, + "num_tokens": 683013142.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "grad_norm": 1.6050853729248047, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8887040019035339, + "num_tokens": 683049195.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "grad_norm": 1.7771282196044922, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8817620277404785, + "num_tokens": 683085484.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "grad_norm": 1.5618895292282104, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8909541368484497, + "num_tokens": 683122811.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "grad_norm": 1.8269872665405273, + "learning_rate": 
1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8691240549087524, + "num_tokens": 683156838.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "grad_norm": 1.6616435050964355, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8807581663131714, + "num_tokens": 683194179.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "grad_norm": 1.620164394378662, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8756623864173889, + "num_tokens": 683233591.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "grad_norm": 1.615405559539795, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8731123208999634, + "num_tokens": 683277376.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "grad_norm": 1.556453824043274, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8939975500106812, + "num_tokens": 683315556.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "grad_norm": 1.5112541913986206, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8873888254165649, + "num_tokens": 683356168.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "grad_norm": 1.6372984647750854, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8743554353713989, + "num_tokens": 683396018.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "grad_norm": 1.6707929372787476, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8834192752838135, + "num_tokens": 683435287.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "grad_norm": 1.5373103618621826, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8893740177154541, + "num_tokens": 683477835.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "grad_norm": 1.554448127746582, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8788087368011475, + "num_tokens": 683516276.0, + "step": 17916 + }, + { + 
"epoch": 2.2792265615061695, + "grad_norm": 1.4984691143035889, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8874350786209106, + "num_tokens": 683555410.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "grad_norm": 1.7883838415145874, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.883988082408905, + "num_tokens": 683585236.0, + "step": 17918 + }, + { + "epoch": 2.2794809820633506, + "grad_norm": 1.577086091041565, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.875484824180603, + "num_tokens": 683624996.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "grad_norm": 1.667973518371582, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8881564736366272, + "num_tokens": 683657801.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "grad_norm": 1.5441761016845703, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8848939538002014, + "num_tokens": 683696076.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "grad_norm": 1.5340243577957153, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8778475522994995, + "num_tokens": 683736001.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "grad_norm": 1.6009727716445923, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8929314613342285, + "num_tokens": 683770422.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "grad_norm": 1.6239142417907715, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8669719696044922, + "num_tokens": 683810389.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "grad_norm": 1.6964795589447021, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8753619194030762, + "num_tokens": 683846532.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "grad_norm": 1.6580337285995483, + "learning_rate": 1e-06, + "loss": 0.3432, + 
"mean_token_accuracy": 0.8767038583755493, + "num_tokens": 683883246.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "grad_norm": 1.5497515201568604, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8788318634033203, + "num_tokens": 683923891.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "grad_norm": 1.4989031553268433, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8868745565414429, + "num_tokens": 683964346.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "grad_norm": 1.4899590015411377, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8822413086891174, + "num_tokens": 684006434.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "grad_norm": 1.4798206090927124, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8893954157829285, + "num_tokens": 684046967.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "grad_norm": 1.524568796157837, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8798014521598816, + "num_tokens": 684084186.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "grad_norm": 1.546308994293213, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.895804762840271, + "num_tokens": 684119242.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "grad_norm": 1.477111577987671, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8695265650749207, + "num_tokens": 684162141.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "grad_norm": 1.538534164428711, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8832455277442932, + "num_tokens": 684200148.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "grad_norm": 1.5850529670715332, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8907052278518677, + "num_tokens": 684235410.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, 
+ "grad_norm": 1.5433114767074585, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8894215822219849, + "num_tokens": 684273117.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "grad_norm": 1.4658740758895874, + "learning_rate": 1e-06, + "loss": 0.2751, + "mean_token_accuracy": 0.8984825611114502, + "num_tokens": 684309707.0, + "step": 17937 + }, + { + "epoch": 2.2818979773565706, + "grad_norm": 1.378577470779419, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8944895267486572, + "num_tokens": 684355742.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "grad_norm": 1.5686532258987427, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8778923749923706, + "num_tokens": 684393490.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "grad_norm": 1.4859466552734375, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8829083442687988, + "num_tokens": 684431934.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "grad_norm": 1.5274341106414795, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8836562633514404, + "num_tokens": 684469713.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "grad_norm": 1.7136096954345703, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8843075037002563, + "num_tokens": 684502394.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "grad_norm": 1.584415078163147, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8808125257492065, + "num_tokens": 684541582.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "grad_norm": 1.5377624034881592, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8823679685592651, + "num_tokens": 684581494.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "grad_norm": 1.4840366840362549, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 
0.8937420845031738, + "num_tokens": 684624294.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "grad_norm": 1.5410733222961426, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8877080678939819, + "num_tokens": 684662574.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "grad_norm": 1.8330154418945312, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8811908960342407, + "num_tokens": 684698354.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "grad_norm": 1.5066823959350586, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8815093040466309, + "num_tokens": 684741965.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "grad_norm": 1.527022123336792, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8963080644607544, + "num_tokens": 684781023.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "grad_norm": 1.5433473587036133, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8848549127578735, + "num_tokens": 684821849.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "grad_norm": 1.662575602531433, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8814213275909424, + "num_tokens": 684857341.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "grad_norm": 1.6389087438583374, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8773196935653687, + "num_tokens": 684895101.0, + "step": 17952 + }, + { + "epoch": 2.283806131535428, + "grad_norm": 1.7734489440917969, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8820251822471619, + "num_tokens": 684925231.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "grad_norm": 1.5898444652557373, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8720568418502808, + "num_tokens": 684963548.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "grad_norm": 
1.5442509651184082, + "learning_rate": 1e-06, + "loss": 0.2709, + "mean_token_accuracy": 0.9022040367126465, + "num_tokens": 684998521.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "grad_norm": 1.6464930772781372, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8807355165481567, + "num_tokens": 685032580.0, + "step": 17956 + }, + { + "epoch": 2.28431497264979, + "grad_norm": 1.6229313611984253, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8892562389373779, + "num_tokens": 685066827.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "grad_norm": 1.6533125638961792, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.877644419670105, + "num_tokens": 685103326.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "grad_norm": 1.5017391443252563, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8940883874893188, + "num_tokens": 685142896.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "grad_norm": 1.5073299407958984, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8782428503036499, + "num_tokens": 685183157.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "grad_norm": 1.6209499835968018, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8819177150726318, + "num_tokens": 685216525.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "grad_norm": 1.522890567779541, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8931761980056763, + "num_tokens": 685253336.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "grad_norm": 1.5626856088638306, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8813803195953369, + "num_tokens": 685292831.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "grad_norm": 1.5478780269622803, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8828613758087158, + "num_tokens": 
685332895.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "grad_norm": 1.5060852766036987, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8726169466972351, + "num_tokens": 685375251.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "grad_norm": 1.4347385168075562, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8898043632507324, + "num_tokens": 685416787.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "grad_norm": 1.5582854747772217, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8977911472320557, + "num_tokens": 685454418.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 1.6770122051239014, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.8985318541526794, + "num_tokens": 685484809.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "grad_norm": 1.5686988830566406, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8875690698623657, + "num_tokens": 685522985.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "grad_norm": 1.5949604511260986, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8793506622314453, + "num_tokens": 685559107.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "grad_norm": 1.456931233406067, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8913853168487549, + "num_tokens": 685603025.0, + "step": 17971 + }, + { + "epoch": 2.2862231268286477, + "grad_norm": 1.7516535520553589, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8751152753829956, + "num_tokens": 685635783.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "grad_norm": 1.6802927255630493, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8838051557540894, + "num_tokens": 685676461.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "grad_norm": 1.5025745630264282, + "learning_rate": 
1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8851766586303711, + "num_tokens": 685716158.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "grad_norm": 1.4571987390518188, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.89793461561203, + "num_tokens": 685754978.0, + "step": 17975 + }, + { + "epoch": 2.28673196794301, + "grad_norm": 1.538295865058899, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8561708927154541, + "num_tokens": 685799566.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "grad_norm": 1.510366678237915, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8885493874549866, + "num_tokens": 685837087.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "grad_norm": 1.6192705631256104, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8764803409576416, + "num_tokens": 685872786.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "grad_norm": 1.5704272985458374, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8936833739280701, + "num_tokens": 685912496.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "grad_norm": 1.5017743110656738, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8943447470664978, + "num_tokens": 685950442.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "grad_norm": 1.5026719570159912, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.889318585395813, + "num_tokens": 685986885.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "grad_norm": 1.5077074766159058, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8934136033058167, + "num_tokens": 686025556.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "grad_norm": 1.4795974493026733, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8957201242446899, + "num_tokens": 686061998.0, + "step": 17983 + }, + { + 
"epoch": 2.287749650171734, + "grad_norm": 1.6116305589675903, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8853083848953247, + "num_tokens": 686099256.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "grad_norm": 1.7559095621109009, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8841198086738586, + "num_tokens": 686132053.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "grad_norm": 1.4922395944595337, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8861855268478394, + "num_tokens": 686170753.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "grad_norm": 1.4335880279541016, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.886755108833313, + "num_tokens": 686213851.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "grad_norm": 1.4937293529510498, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8782105445861816, + "num_tokens": 686256776.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "grad_norm": 1.7050045728683472, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8827176690101624, + "num_tokens": 686296698.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "grad_norm": 1.713119626045227, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8743541240692139, + "num_tokens": 686333758.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "grad_norm": 1.4357898235321045, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8862024545669556, + "num_tokens": 686376806.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "grad_norm": 1.5239148139953613, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8861371874809265, + "num_tokens": 686418497.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "grad_norm": 1.5850061178207397, + "learning_rate": 1e-06, + "loss": 0.308, + 
"mean_token_accuracy": 0.8846421837806702, + "num_tokens": 686456791.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "grad_norm": 1.659811019897461, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8858139514923096, + "num_tokens": 686491390.0, + "step": 17994 + }, + { + "epoch": 2.2891489632362294, + "grad_norm": 1.5315595865249634, + "learning_rate": 1e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8952471017837524, + "num_tokens": 686529047.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "grad_norm": 1.6303181648254395, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8829597234725952, + "num_tokens": 686568657.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "grad_norm": 1.4714226722717285, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8843756914138794, + "num_tokens": 686610738.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "grad_norm": 1.5709009170532227, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8851777911186218, + "num_tokens": 686648875.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "grad_norm": 1.5131947994232178, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8932454586029053, + "num_tokens": 686685432.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "grad_norm": 1.5152568817138672, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8795771598815918, + "num_tokens": 686726011.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "grad_norm": 1.5729351043701172, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8834633827209473, + "num_tokens": 686766579.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "grad_norm": 1.5305134057998657, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8846796751022339, + "num_tokens": 686803859.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, 
+ "grad_norm": 1.5452749729156494, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8877716660499573, + "num_tokens": 686844304.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "grad_norm": 1.6461000442504883, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8867715001106262, + "num_tokens": 686878562.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "grad_norm": 1.6021350622177124, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8951126337051392, + "num_tokens": 686912075.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "grad_norm": 1.4931278228759766, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.875486433506012, + "num_tokens": 686955598.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "grad_norm": 1.5997681617736816, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8780196905136108, + "num_tokens": 686994231.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "grad_norm": 1.6375164985656738, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8858469724655151, + "num_tokens": 687029495.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "grad_norm": 1.4560269117355347, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8929961919784546, + "num_tokens": 687070618.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "grad_norm": 1.576657772064209, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8803949952125549, + "num_tokens": 687113842.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "grad_norm": 1.498818278312683, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8806723356246948, + "num_tokens": 687152110.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "grad_norm": 1.3771673440933228, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8918648958206177, + 
"num_tokens": 687192193.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "grad_norm": 1.4261552095413208, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8882662057876587, + "num_tokens": 687234000.0, + "step": 18013 + }, + { + "epoch": 2.291565958529449, + "grad_norm": 1.5484275817871094, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8805789947509766, + "num_tokens": 687277577.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "grad_norm": 1.6048556566238403, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8857966065406799, + "num_tokens": 687314570.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "grad_norm": 1.5761655569076538, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9003744125366211, + "num_tokens": 687348431.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "grad_norm": 1.5132801532745361, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8942810893058777, + "num_tokens": 687388860.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "grad_norm": 1.557903528213501, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8929201364517212, + "num_tokens": 687425773.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "grad_norm": 1.6431289911270142, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8915407657623291, + "num_tokens": 687460691.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "grad_norm": 1.5300668478012085, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8892321586608887, + "num_tokens": 687497309.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "grad_norm": 1.5931137800216675, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8806138038635254, + "num_tokens": 687535432.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "grad_norm": 1.7805112600326538, + 
"learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8596660494804382, + "num_tokens": 687571353.0, + "step": 18022 + }, + { + "epoch": 2.2927108510367638, + "grad_norm": 1.8230044841766357, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8863323330879211, + "num_tokens": 687605246.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "grad_norm": 1.5921565294265747, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8689898252487183, + "num_tokens": 687646346.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "grad_norm": 1.4848668575286865, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8805494904518127, + "num_tokens": 687689795.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "grad_norm": 1.5718090534210205, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8828036785125732, + "num_tokens": 687725401.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "grad_norm": 1.473300576210022, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8809103965759277, + "num_tokens": 687768704.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "grad_norm": 1.6221798658370972, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8929527997970581, + "num_tokens": 687806929.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "grad_norm": 1.6555769443511963, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8844672441482544, + "num_tokens": 687843087.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "grad_norm": 1.5839389562606812, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8710120916366577, + "num_tokens": 687883369.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "grad_norm": 1.5561654567718506, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8973926305770874, + "num_tokens": 687919639.0, + 
"step": 18031 + }, + { + "epoch": 2.2938557435440785, + "grad_norm": 1.4078282117843628, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8928547501564026, + "num_tokens": 687965047.0, + "step": 18032 + }, + { + "epoch": 2.293982953822669, + "grad_norm": 1.6391407251358032, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8729945421218872, + "num_tokens": 687999871.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "grad_norm": 1.5247234106063843, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8942733407020569, + "num_tokens": 688036006.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "grad_norm": 1.7023637294769287, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8708263635635376, + "num_tokens": 688074350.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "grad_norm": 1.5842483043670654, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8865809440612793, + "num_tokens": 688113031.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "grad_norm": 1.5776747465133667, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8777741193771362, + "num_tokens": 688152046.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "grad_norm": 1.8071515560150146, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8795690536499023, + "num_tokens": 688182036.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "grad_norm": 1.5584512948989868, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8848132491111755, + "num_tokens": 688221250.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "grad_norm": 1.5144356489181519, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8798964023590088, + "num_tokens": 688262307.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "grad_norm": 1.548317790031433, + "learning_rate": 1e-06, + 
"loss": 0.299, + "mean_token_accuracy": 0.8975353240966797, + "num_tokens": 688300670.0, + "step": 18041 + }, + { + "epoch": 2.2951278463299833, + "grad_norm": 1.7469637393951416, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8802197575569153, + "num_tokens": 688334702.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "grad_norm": 1.4319883584976196, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.8990491628646851, + "num_tokens": 688373847.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "grad_norm": 1.4569756984710693, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8881467580795288, + "num_tokens": 688415234.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "grad_norm": 1.6061497926712036, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8724995851516724, + "num_tokens": 688453120.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "grad_norm": 1.6376103162765503, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8900306820869446, + "num_tokens": 688487916.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "grad_norm": 1.4307361841201782, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8950132727622986, + "num_tokens": 688532472.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "grad_norm": 1.658795714378357, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8938454389572144, + "num_tokens": 688565369.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "grad_norm": 1.5579109191894531, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8896146416664124, + "num_tokens": 688600574.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "grad_norm": 1.609773874282837, + "learning_rate": 1e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.8982982635498047, + "num_tokens": 688633983.0, + "step": 18050 + }, + { + "epoch": 
2.296272738837298, + "grad_norm": 1.6302410364151, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.886347770690918, + "num_tokens": 688672870.0, + "step": 18051 + }, + { + "epoch": 2.2963999491158886, + "grad_norm": 1.69404935836792, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8792123794555664, + "num_tokens": 688707252.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "grad_norm": 1.7410497665405273, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8872557878494263, + "num_tokens": 688740524.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "grad_norm": 1.7285157442092896, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8799569606781006, + "num_tokens": 688775182.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "grad_norm": 1.5829219818115234, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8919066190719604, + "num_tokens": 688812370.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "grad_norm": 1.6358014345169067, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8848474025726318, + "num_tokens": 688850529.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "grad_norm": 1.7699668407440186, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8774648308753967, + "num_tokens": 688885093.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "grad_norm": 1.6626226902008057, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8865367770195007, + "num_tokens": 688919650.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "grad_norm": 1.5737420320510864, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8939650058746338, + "num_tokens": 688959680.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "grad_norm": 1.682390570640564, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 
0.8895070552825928, + "num_tokens": 688998005.0, + "step": 18060 + }, + { + "epoch": 2.2975448416232034, + "grad_norm": 1.5361499786376953, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.893092691898346, + "num_tokens": 689038354.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "grad_norm": 1.5158668756484985, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8853530287742615, + "num_tokens": 689076813.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "grad_norm": 1.5360016822814941, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8911451697349548, + "num_tokens": 689115314.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "grad_norm": 1.6119201183319092, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8840065002441406, + "num_tokens": 689152888.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "grad_norm": 1.5564154386520386, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8754271864891052, + "num_tokens": 689192949.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "grad_norm": 1.5375325679779053, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8872693777084351, + "num_tokens": 689234931.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "grad_norm": 1.5209037065505981, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8840099573135376, + "num_tokens": 689275834.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "grad_norm": 1.527101755142212, + "learning_rate": 1e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.895897388458252, + "num_tokens": 689311891.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "grad_norm": 1.718409776687622, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8766991496086121, + "num_tokens": 689345812.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "grad_norm": 
1.6046926975250244, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.8985933661460876, + "num_tokens": 689377150.0, + "step": 18070 + }, + { + "epoch": 2.298816944409108, + "grad_norm": 1.5828567743301392, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8901592493057251, + "num_tokens": 689414366.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "grad_norm": 1.496200442314148, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8615435361862183, + "num_tokens": 689458965.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "grad_norm": 1.5767667293548584, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8903482556343079, + "num_tokens": 689494311.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "grad_norm": 1.6540285348892212, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8880781531333923, + "num_tokens": 689528379.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "grad_norm": 1.5153553485870361, + "learning_rate": 1e-06, + "loss": 0.2768, + "mean_token_accuracy": 0.8992440700531006, + "num_tokens": 689563421.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "grad_norm": 1.7230256795883179, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8761410713195801, + "num_tokens": 689597568.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "grad_norm": 1.5715231895446777, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8845032453536987, + "num_tokens": 689634768.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "grad_norm": 1.4823532104492188, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9054873585700989, + "num_tokens": 689672422.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "grad_norm": 1.4958503246307373, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8916339874267578, + "num_tokens": 
689713925.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, + "grad_norm": 1.6276214122772217, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8891725540161133, + "num_tokens": 689747816.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "grad_norm": 1.5887359380722046, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8963370323181152, + "num_tokens": 689782978.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "grad_norm": 1.5101263523101807, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8790559768676758, + "num_tokens": 689823975.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "grad_norm": 1.6606576442718506, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8875547647476196, + "num_tokens": 689855599.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "grad_norm": 1.665079116821289, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8784481287002563, + "num_tokens": 689891018.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "grad_norm": 1.6315968036651611, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8945094347000122, + "num_tokens": 689923756.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "grad_norm": 1.5525331497192383, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8860765695571899, + "num_tokens": 689961517.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "grad_norm": 1.533875584602356, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8921964168548584, + "num_tokens": 689997987.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "grad_norm": 1.5279786586761475, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8756217956542969, + "num_tokens": 690041836.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "grad_norm": 1.6463699340820312, + "learning_rate": 
1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8696753978729248, + "num_tokens": 690076792.0, + "step": 18089 + }, + { + "epoch": 2.301233939702328, + "grad_norm": 1.5648000240325928, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8793162703514099, + "num_tokens": 690116776.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "grad_norm": 1.528972864151001, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8924540281295776, + "num_tokens": 690152716.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "grad_norm": 1.5708723068237305, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8799858689308167, + "num_tokens": 690193416.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "grad_norm": 1.5935834646224976, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8816666603088379, + "num_tokens": 690236900.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "grad_norm": 1.4604814052581787, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8916118144989014, + "num_tokens": 690279945.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "grad_norm": 1.5760362148284912, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.885939359664917, + "num_tokens": 690316086.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "grad_norm": 1.6318546533584595, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8915186524391174, + "num_tokens": 690352062.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "grad_norm": 1.6353981494903564, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8827779293060303, + "num_tokens": 690388002.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "grad_norm": 1.5294265747070312, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8761830925941467, + "num_tokens": 690430523.0, + "step": 18098 + }, + { + 
"epoch": 2.3023788322096426, + "grad_norm": 1.520396113395691, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8809078931808472, + "num_tokens": 690469076.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "grad_norm": 1.6147339344024658, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8775438070297241, + "num_tokens": 690509237.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "grad_norm": 1.483193039894104, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8840541839599609, + "num_tokens": 690550664.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "grad_norm": 1.6360549926757812, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.895672619342804, + "num_tokens": 690582529.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "grad_norm": 1.4750816822052002, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8963806629180908, + "num_tokens": 690622097.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "grad_norm": 1.673635721206665, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8922384977340698, + "num_tokens": 690654813.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "grad_norm": 1.5870945453643799, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8775666356086731, + "num_tokens": 690691971.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "grad_norm": 1.6049226522445679, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8869925737380981, + "num_tokens": 690726695.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "grad_norm": 1.5711921453475952, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8999127745628357, + "num_tokens": 690760528.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "grad_norm": 1.4658135175704956, + "learning_rate": 1e-06, + "loss": 0.3264, + 
"mean_token_accuracy": 0.8815180063247681, + "num_tokens": 690802204.0, + "step": 18108 + }, + { + "epoch": 2.303650934995548, + "grad_norm": 1.4693641662597656, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8809967041015625, + "num_tokens": 690847059.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "grad_norm": 1.498012661933899, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.892729640007019, + "num_tokens": 690882110.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "grad_norm": 1.7610751390457153, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8948118686676025, + "num_tokens": 690914578.0, + "step": 18111 + }, + { + "epoch": 2.304032565831319, + "grad_norm": 1.5107604265213013, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8827614784240723, + "num_tokens": 690954623.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "grad_norm": 1.458929181098938, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8987610340118408, + "num_tokens": 690994835.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "grad_norm": 1.5478609800338745, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8797758221626282, + "num_tokens": 691034878.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "grad_norm": 1.5766013860702515, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8847416043281555, + "num_tokens": 691072900.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "grad_norm": 1.4260623455047607, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.891925573348999, + "num_tokens": 691112191.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "grad_norm": 1.5372849702835083, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8911522030830383, + "num_tokens": 691147821.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + 
"grad_norm": 1.5859785079956055, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8912710547447205, + "num_tokens": 691183964.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "grad_norm": 1.5920476913452148, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8871344923973083, + "num_tokens": 691220393.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "grad_norm": 1.6608407497406006, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8869105577468872, + "num_tokens": 691258283.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "grad_norm": 1.544061303138733, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8862671256065369, + "num_tokens": 691297066.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "grad_norm": 1.597883939743042, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8883303999900818, + "num_tokens": 691338043.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "grad_norm": 1.5227628946304321, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8863881826400757, + "num_tokens": 691379033.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "grad_norm": 1.4245951175689697, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8872259855270386, + "num_tokens": 691423152.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "grad_norm": 1.6027586460113525, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8871849775314331, + "num_tokens": 691464906.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "grad_norm": 1.659834623336792, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8851395845413208, + "num_tokens": 691499234.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "grad_norm": 1.5075862407684326, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8838213682174683, + 
"num_tokens": 691539376.0, + "step": 18127 + }, + { + "epoch": 2.3060679302887674, + "grad_norm": 1.5558940172195435, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8796939849853516, + "num_tokens": 691579115.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "grad_norm": 1.5571513175964355, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.89010089635849, + "num_tokens": 691614376.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "grad_norm": 1.6367629766464233, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8843845129013062, + "num_tokens": 691650967.0, + "step": 18130 + }, + { + "epoch": 2.306449561124539, + "grad_norm": 1.554226040840149, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8860853910446167, + "num_tokens": 691690443.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "grad_norm": 1.5846279859542847, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8936089277267456, + "num_tokens": 691726157.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "grad_norm": 1.5571407079696655, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8914848566055298, + "num_tokens": 691763879.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "grad_norm": 1.441321849822998, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8787423372268677, + "num_tokens": 691811499.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "grad_norm": 1.6732507944107056, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8877861499786377, + "num_tokens": 691847174.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "grad_norm": 1.540623664855957, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8895931243896484, + "num_tokens": 691887819.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "grad_norm": 1.592117190361023, + 
"learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8740596771240234, + "num_tokens": 691931205.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "grad_norm": 1.5255464315414429, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8848480582237244, + "num_tokens": 691969625.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "grad_norm": 1.5230859518051147, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8842262625694275, + "num_tokens": 692010512.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "grad_norm": 1.6033735275268555, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.890117883682251, + "num_tokens": 692049801.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "grad_norm": 1.5522081851959229, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8806014060974121, + "num_tokens": 692088460.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "grad_norm": 1.6089520454406738, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8926616311073303, + "num_tokens": 692124027.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "grad_norm": 1.689491629600525, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8761123418807983, + "num_tokens": 692160286.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "grad_norm": 1.5526329278945923, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8829470872879028, + "num_tokens": 692197312.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "grad_norm": 1.5242737531661987, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8786212801933289, + "num_tokens": 692236238.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "grad_norm": 1.5711480379104614, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8647469282150269, + "num_tokens": 692278758.0, + "step": 
18146 + }, + { + "epoch": 2.308484925581987, + "grad_norm": 1.5402976274490356, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8812927007675171, + "num_tokens": 692317180.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "grad_norm": 1.6570225954055786, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8972576260566711, + "num_tokens": 692348571.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "grad_norm": 1.6542249917984009, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8854265213012695, + "num_tokens": 692386060.0, + "step": 18149 + }, + { + "epoch": 2.3088665564177586, + "grad_norm": 1.5274553298950195, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8839006423950195, + "num_tokens": 692423999.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "grad_norm": 1.6266067028045654, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8843883275985718, + "num_tokens": 692460001.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "grad_norm": 1.5577483177185059, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8842601776123047, + "num_tokens": 692496966.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "grad_norm": 1.654334545135498, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8772327303886414, + "num_tokens": 692532680.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "grad_norm": 1.5786857604980469, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8817243576049805, + "num_tokens": 692572256.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "grad_norm": 1.5543287992477417, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8727525472640991, + "num_tokens": 692614709.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "grad_norm": 1.7884397506713867, + "learning_rate": 1e-06, + "loss": 0.3284, + 
"mean_token_accuracy": 0.8838846683502197, + "num_tokens": 692643747.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "grad_norm": 1.5178499221801758, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8878865242004395, + "num_tokens": 692682037.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "grad_norm": 1.6513334512710571, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8723260164260864, + "num_tokens": 692721900.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "grad_norm": 1.6805052757263184, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8831913471221924, + "num_tokens": 692756725.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "grad_norm": 1.5005955696105957, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8819317817687988, + "num_tokens": 692799419.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "grad_norm": 1.5113669633865356, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8905506134033203, + "num_tokens": 692840705.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "grad_norm": 1.8108655214309692, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8715251684188843, + "num_tokens": 692872298.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "grad_norm": 1.5038001537322998, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8904184699058533, + "num_tokens": 692909763.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "grad_norm": 1.5807675123214722, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8694348335266113, + "num_tokens": 692950010.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "grad_norm": 1.6706910133361816, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8748669028282166, + "num_tokens": 692988366.0, + "step": 18165 + }, + { + "epoch": 
2.3109019208752066, + "grad_norm": 1.4016717672348022, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8878711462020874, + "num_tokens": 693034533.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "grad_norm": 1.5091519355773926, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8792539238929749, + "num_tokens": 693075552.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "grad_norm": 1.533953070640564, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8708873391151428, + "num_tokens": 693119503.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "grad_norm": 1.645873785018921, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.886486291885376, + "num_tokens": 693157255.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "grad_norm": 1.7351402044296265, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8710596561431885, + "num_tokens": 693195031.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "grad_norm": 1.5808812379837036, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8854584693908691, + "num_tokens": 693229741.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "grad_norm": 1.491153597831726, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9019016623497009, + "num_tokens": 693266744.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "grad_norm": 1.6050835847854614, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8896989822387695, + "num_tokens": 693299880.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "grad_norm": 1.667654275894165, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8884278535842896, + "num_tokens": 693339229.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "grad_norm": 1.3107296228408813, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 
0.8982334733009338, + "num_tokens": 693385573.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "grad_norm": 1.595525860786438, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8870015144348145, + "num_tokens": 693421317.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "grad_norm": 1.620137095451355, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8940654993057251, + "num_tokens": 693455792.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "grad_norm": 1.5710930824279785, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8794875741004944, + "num_tokens": 693496022.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "grad_norm": 1.6835886240005493, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8871389627456665, + "num_tokens": 693528280.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "grad_norm": 1.6319952011108398, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8857542276382446, + "num_tokens": 693566046.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "grad_norm": 1.6130727529525757, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8873648643493652, + "num_tokens": 693600197.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "grad_norm": 1.7606886625289917, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8746112585067749, + "num_tokens": 693635546.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "grad_norm": 1.635256290435791, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8637338876724243, + "num_tokens": 693678964.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "grad_norm": 1.4992574453353882, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8846566677093506, + "num_tokens": 693720919.0, + "step": 18184 + }, + { + "epoch": 2.313318916168426, + "grad_norm": 
1.45816969871521, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8888195157051086, + "num_tokens": 693762756.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "grad_norm": 1.442000389099121, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8978733420372009, + "num_tokens": 693803722.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "grad_norm": 1.522597312927246, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.887273907661438, + "num_tokens": 693845057.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "grad_norm": 1.7288570404052734, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8927962779998779, + "num_tokens": 693881549.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "grad_norm": 1.4903302192687988, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8898537755012512, + "num_tokens": 693920736.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "grad_norm": 1.5700260400772095, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8827687501907349, + "num_tokens": 693961431.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "grad_norm": 1.6244158744812012, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8709354400634766, + "num_tokens": 694001545.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "grad_norm": 1.4912630319595337, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8816611766815186, + "num_tokens": 694044457.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "grad_norm": 1.5408576726913452, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8790243864059448, + "num_tokens": 694085646.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "grad_norm": 1.4452170133590698, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.8983772993087769, + "num_tokens": 
694125446.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "grad_norm": 1.4971696138381958, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8933191299438477, + "num_tokens": 694163317.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "grad_norm": 1.5066280364990234, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8847010135650635, + "num_tokens": 694201605.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "grad_norm": 1.486158013343811, + "learning_rate": 1e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.896763801574707, + "num_tokens": 694239758.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "grad_norm": 1.7676459550857544, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8632577657699585, + "num_tokens": 694276271.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "grad_norm": 1.550418496131897, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8765286207199097, + "num_tokens": 694317082.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "grad_norm": 1.5747087001800537, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8887360095977783, + "num_tokens": 694356458.0, + "step": 18200 + }, + { + "epoch": 2.3153542806258747, + "grad_norm": 1.6018184423446655, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8840727806091309, + "num_tokens": 694399297.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "grad_norm": 1.5339032411575317, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8885226249694824, + "num_tokens": 694437380.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "grad_norm": 1.6731562614440918, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8808637857437134, + "num_tokens": 694477959.0, + "step": 18203 + }, + { + "epoch": 2.3157359114616463, + "grad_norm": 1.528694748878479, + "learning_rate": 
1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.878638744354248, + "num_tokens": 694522976.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "grad_norm": 1.568618655204773, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8942851424217224, + "num_tokens": 694558695.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "grad_norm": 1.526336908340454, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.890845775604248, + "num_tokens": 694596251.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "grad_norm": 1.7334991693496704, + "learning_rate": 1e-06, + "loss": 0.2699, + "mean_token_accuracy": 0.9022419452667236, + "num_tokens": 694632620.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "grad_norm": 1.4803534746170044, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8962161540985107, + "num_tokens": 694671955.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "grad_norm": 1.7350959777832031, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.887238621711731, + "num_tokens": 694703409.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "grad_norm": 1.5034470558166504, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.881059467792511, + "num_tokens": 694745959.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "grad_norm": 1.6025409698486328, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8814265727996826, + "num_tokens": 694780506.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "grad_norm": 1.8205864429473877, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8638088703155518, + "num_tokens": 694813533.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "grad_norm": 1.8019994497299194, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8800684213638306, + "num_tokens": 694847906.0, + "step": 18213 + }, + { + 
"epoch": 2.317008014247551, + "grad_norm": 1.6753062009811401, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8850802779197693, + "num_tokens": 694883872.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "grad_norm": 1.4180539846420288, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8893067836761475, + "num_tokens": 694925434.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "grad_norm": 1.5231878757476807, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8834208250045776, + "num_tokens": 694963704.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "grad_norm": 1.4560046195983887, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.8993545770645142, + "num_tokens": 695001819.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "grad_norm": 1.4808460474014282, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8937691450119019, + "num_tokens": 695040581.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "grad_norm": 1.5131820440292358, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8861929178237915, + "num_tokens": 695081853.0, + "step": 18219 + }, + { + "epoch": 2.3177712759190943, + "grad_norm": 1.576106071472168, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8735387325286865, + "num_tokens": 695121148.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "grad_norm": 1.6370947360992432, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8852609395980835, + "num_tokens": 695154108.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "grad_norm": 1.6297297477722168, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8896631002426147, + "num_tokens": 695189257.0, + "step": 18222 + }, + { + "epoch": 2.318152906754866, + "grad_norm": 1.5708637237548828, + "learning_rate": 1e-06, + "loss": 0.3266, + 
"mean_token_accuracy": 0.8857250213623047, + "num_tokens": 695230900.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "grad_norm": 1.565250039100647, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8955705165863037, + "num_tokens": 695265514.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "grad_norm": 1.811698317527771, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8737462759017944, + "num_tokens": 695299055.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "grad_norm": 1.639247179031372, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8934905529022217, + "num_tokens": 695331197.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "grad_norm": 1.564800500869751, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8899542689323425, + "num_tokens": 695370497.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "grad_norm": 1.533105731010437, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8764362335205078, + "num_tokens": 695411329.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "grad_norm": 1.4915417432785034, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8917075395584106, + "num_tokens": 695450633.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "grad_norm": 1.469712734222412, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.890994668006897, + "num_tokens": 695496048.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "grad_norm": 1.6429579257965088, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8828040361404419, + "num_tokens": 695531482.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "grad_norm": 1.5873405933380127, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8865633010864258, + "num_tokens": 695567349.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, 
+ "grad_norm": 1.6418545246124268, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8891072869300842, + "num_tokens": 695604607.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "grad_norm": 1.513251781463623, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8963315486907959, + "num_tokens": 695640158.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "grad_norm": 1.5722626447677612, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8919495344161987, + "num_tokens": 695677502.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "grad_norm": 1.5898953676223755, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8831405639648438, + "num_tokens": 695715837.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "grad_norm": 1.6195268630981445, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8871937990188599, + "num_tokens": 695750697.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "grad_norm": 1.5431517362594604, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8834471702575684, + "num_tokens": 695787019.0, + "step": 18238 + }, + { + "epoch": 2.320188271212314, + "grad_norm": 1.7089951038360596, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8779707551002502, + "num_tokens": 695824269.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "grad_norm": 1.4683418273925781, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8870148658752441, + "num_tokens": 695862828.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "grad_norm": 1.5198079347610474, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8914973735809326, + "num_tokens": 695900232.0, + "step": 18241 + }, + { + "epoch": 2.3205699020480854, + "grad_norm": 1.5404192209243774, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8879106044769287, 
+ "num_tokens": 695941088.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "grad_norm": 1.4249507188796997, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8901140689849854, + "num_tokens": 695983782.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "grad_norm": 1.5462356805801392, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8840717077255249, + "num_tokens": 696022462.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "grad_norm": 1.563443660736084, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8716071844100952, + "num_tokens": 696060705.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "grad_norm": 1.6866167783737183, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8936557769775391, + "num_tokens": 696095644.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "grad_norm": 1.630238652229309, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8902767300605774, + "num_tokens": 696129145.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "grad_norm": 1.48259437084198, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.8973104953765869, + "num_tokens": 696170121.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "grad_norm": 1.822651982307434, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8815878629684448, + "num_tokens": 696199462.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "grad_norm": 1.6358025074005127, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8893269300460815, + "num_tokens": 696234779.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "grad_norm": 1.4606000185012817, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8927680253982544, + "num_tokens": 696275634.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "grad_norm": 1.470828890800476, + 
"learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8976334929466248, + "num_tokens": 696314893.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "grad_norm": 1.5780593156814575, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8798345327377319, + "num_tokens": 696353191.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "grad_norm": 1.6649367809295654, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8891956210136414, + "num_tokens": 696384465.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "grad_norm": 1.7780842781066895, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8834125399589539, + "num_tokens": 696417410.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "grad_norm": 1.5293880701065063, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8983621597290039, + "num_tokens": 696454207.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "grad_norm": 1.4677529335021973, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8935306668281555, + "num_tokens": 696492916.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "grad_norm": 1.647200107574463, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8862069845199585, + "num_tokens": 696526537.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "grad_norm": 1.6347362995147705, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8780068159103394, + "num_tokens": 696563006.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "grad_norm": 1.6504051685333252, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.890052855014801, + "num_tokens": 696597309.0, + "step": 18260 + }, + { + "epoch": 2.322986897341305, + "grad_norm": 1.6319594383239746, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8838294744491577, + "num_tokens": 696634787.0, + 
"step": 18261 + }, + { + "epoch": 2.3231141076198956, + "grad_norm": 1.6912448406219482, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8826976418495178, + "num_tokens": 696667688.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "grad_norm": 1.4714229106903076, + "learning_rate": 1e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9009460210800171, + "num_tokens": 696704278.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "grad_norm": 1.4879096746444702, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8838827013969421, + "num_tokens": 696745325.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "grad_norm": 1.597199559211731, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8846673965454102, + "num_tokens": 696786162.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "grad_norm": 1.5547685623168945, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8945549726486206, + "num_tokens": 696821055.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "grad_norm": 1.594739317893982, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.887510359287262, + "num_tokens": 696859552.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "grad_norm": 1.4888198375701904, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8880759477615356, + "num_tokens": 696900319.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "grad_norm": 1.4492082595825195, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8898782730102539, + "num_tokens": 696942890.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "grad_norm": 1.5024937391281128, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.884053111076355, + "num_tokens": 696983482.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "grad_norm": 1.5246915817260742, + "learning_rate": 1e-06, + "loss": 
0.2933, + "mean_token_accuracy": 0.8962123394012451, + "num_tokens": 697025076.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "grad_norm": 1.527458906173706, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8834888339042664, + "num_tokens": 697066324.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "grad_norm": 1.5142663717269897, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8824082016944885, + "num_tokens": 697102973.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "grad_norm": 1.4728659391403198, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.89963299036026, + "num_tokens": 697138357.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "grad_norm": 1.4583338499069214, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8851103782653809, + "num_tokens": 697179935.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "grad_norm": 1.4668207168579102, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8970781564712524, + "num_tokens": 697219284.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "grad_norm": 1.507597804069519, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8889568448066711, + "num_tokens": 697257817.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "grad_norm": 1.4893287420272827, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8949445486068726, + "num_tokens": 697297234.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "grad_norm": 1.5550090074539185, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8806021213531494, + "num_tokens": 697337080.0, + "step": 18279 + }, + { + "epoch": 2.325403892634525, + "grad_norm": 1.6008416414260864, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8962937593460083, + "num_tokens": 697369816.0, + "step": 18280 + }, + { + "epoch": 
2.325531102913115, + "grad_norm": 1.5543828010559082, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8770818114280701, + "num_tokens": 697410506.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "grad_norm": 1.536862850189209, + "learning_rate": 1e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.8980176448822021, + "num_tokens": 697445180.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "grad_norm": 1.613120436668396, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8702546954154968, + "num_tokens": 697485994.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "grad_norm": 1.479720950126648, + "learning_rate": 1e-06, + "loss": 0.2512, + "mean_token_accuracy": 0.9069420695304871, + "num_tokens": 697521267.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "grad_norm": 1.7153419256210327, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8808035850524902, + "num_tokens": 697554008.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "grad_norm": 1.444869875907898, + "learning_rate": 1e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.8979933261871338, + "num_tokens": 697592020.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "grad_norm": 1.553853154182434, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8792352676391602, + "num_tokens": 697634449.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "grad_norm": 1.604533314704895, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8817948698997498, + "num_tokens": 697674231.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "grad_norm": 1.4751509428024292, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8805599212646484, + "num_tokens": 697716358.0, + "step": 18289 + }, + { + "epoch": 2.32667599542043, + "grad_norm": 1.4515223503112793, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 
0.8875633478164673, + "num_tokens": 697756942.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "grad_norm": 1.5551271438598633, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8842589259147644, + "num_tokens": 697796131.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "grad_norm": 1.626618504524231, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8936203718185425, + "num_tokens": 697832691.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "grad_norm": 1.6248717308044434, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8914905786514282, + "num_tokens": 697867419.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "grad_norm": 1.5503480434417725, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8875156044960022, + "num_tokens": 697907386.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "grad_norm": 1.6222662925720215, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8840240836143494, + "num_tokens": 697945958.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "grad_norm": 1.7174168825149536, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8647222518920898, + "num_tokens": 697983771.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "grad_norm": 1.547955870628357, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8826337456703186, + "num_tokens": 698021716.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "grad_norm": 1.5084877014160156, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8788657784461975, + "num_tokens": 698063143.0, + "step": 18298 + }, + { + "epoch": 2.3278208879277447, + "grad_norm": 1.6157029867172241, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8858243227005005, + "num_tokens": 698102389.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "grad_norm": 
1.6569154262542725, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8776164054870605, + "num_tokens": 698139898.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "grad_norm": 1.716200828552246, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8771378993988037, + "num_tokens": 698177486.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "grad_norm": 1.619371771812439, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8878992795944214, + "num_tokens": 698212207.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "grad_norm": 1.610297441482544, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8962779641151428, + "num_tokens": 698247782.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "grad_norm": 1.5546205043792725, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8869468569755554, + "num_tokens": 698285229.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "grad_norm": 1.5485707521438599, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8884095549583435, + "num_tokens": 698321738.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "grad_norm": 1.4303476810455322, + "learning_rate": 1e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.8996966481208801, + "num_tokens": 698361221.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "grad_norm": 1.627470850944519, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8759679794311523, + "num_tokens": 698398810.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "grad_norm": 1.5217477083206177, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8740626573562622, + "num_tokens": 698443241.0, + "step": 18308 + }, + { + "epoch": 2.3290929907136495, + "grad_norm": 1.6791300773620605, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8829014301300049, + "num_tokens": 
698481337.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "grad_norm": 1.3625209331512451, + "learning_rate": 1e-06, + "loss": 0.2669, + "mean_token_accuracy": 0.9027417898178101, + "num_tokens": 698522423.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "grad_norm": 1.658066987991333, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8861972093582153, + "num_tokens": 698559317.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "grad_norm": 1.6718370914459229, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8901851177215576, + "num_tokens": 698592145.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "grad_norm": 1.5534443855285645, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8772317171096802, + "num_tokens": 698633230.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "grad_norm": 1.5122705698013306, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8927414417266846, + "num_tokens": 698672032.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "grad_norm": 1.6761242151260376, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8924938440322876, + "num_tokens": 698708264.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "grad_norm": 1.6902079582214355, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.882514476776123, + "num_tokens": 698744203.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "grad_norm": 1.7553253173828125, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8829339742660522, + "num_tokens": 698777112.0, + "step": 18317 + }, + { + "epoch": 2.3302378832209643, + "grad_norm": 1.5707648992538452, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8868224024772644, + "num_tokens": 698813168.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "grad_norm": 1.5616101026535034, + "learning_rate": 
1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8903473615646362, + "num_tokens": 698846952.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "grad_norm": 1.5829373598098755, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8755576610565186, + "num_tokens": 698885440.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "grad_norm": 1.5547242164611816, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9013532996177673, + "num_tokens": 698921735.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "grad_norm": 1.6858710050582886, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8816812038421631, + "num_tokens": 698960134.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "grad_norm": 1.597022533416748, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8939976692199707, + "num_tokens": 698993602.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "grad_norm": 1.479396104812622, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.891544759273529, + "num_tokens": 699030670.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "grad_norm": 1.4843776226043701, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8930337429046631, + "num_tokens": 699073042.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "grad_norm": 1.6600971221923828, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8853514194488525, + "num_tokens": 699109046.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "grad_norm": 1.4805448055267334, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8929603695869446, + "num_tokens": 699149803.0, + "step": 18327 + }, + { + "epoch": 2.3315099860068695, + "grad_norm": 1.5070446729660034, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.885333776473999, + "num_tokens": 699190457.0, + "step": 18328 + }, + { + 
"epoch": 2.3316371962854596, + "grad_norm": 1.5752229690551758, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8776578903198242, + "num_tokens": 699232366.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "grad_norm": 1.5517942905426025, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8764281868934631, + "num_tokens": 699271795.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "grad_norm": 1.5890976190567017, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8736475706100464, + "num_tokens": 699312496.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "grad_norm": 1.4669697284698486, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.892135739326477, + "num_tokens": 699352448.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "grad_norm": 1.4687384366989136, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8796983957290649, + "num_tokens": 699397067.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "grad_norm": 1.5253067016601562, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8931423425674438, + "num_tokens": 699437740.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "grad_norm": 1.511976718902588, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8940882682800293, + "num_tokens": 699473977.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "grad_norm": 1.550023078918457, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8893259763717651, + "num_tokens": 699512907.0, + "step": 18336 + }, + { + "epoch": 2.332654878514184, + "grad_norm": 1.4267805814743042, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8887301087379456, + "num_tokens": 699558825.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "grad_norm": 1.5435014963150024, + "learning_rate": 1e-06, + "loss": 0.3723, + 
"mean_token_accuracy": 0.868140697479248, + "num_tokens": 699599819.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "grad_norm": 1.745731234550476, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8822265863418579, + "num_tokens": 699629016.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "grad_norm": 1.601924180984497, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8913007378578186, + "num_tokens": 699665038.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "grad_norm": 1.45085871219635, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.8998309373855591, + "num_tokens": 699704615.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "grad_norm": 1.4602816104888916, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8828918933868408, + "num_tokens": 699750149.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "grad_norm": 1.5000828504562378, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8948168754577637, + "num_tokens": 699788445.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "grad_norm": 1.8477225303649902, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8493239879608154, + "num_tokens": 699821389.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "grad_norm": 1.4336575269699097, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8939418196678162, + "num_tokens": 699861252.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "grad_norm": 1.4477907419204712, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8825744986534119, + "num_tokens": 699905858.0, + "step": 18346 + }, + { + "epoch": 2.333926981300089, + "grad_norm": 1.605368971824646, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8775081634521484, + "num_tokens": 699943605.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + 
"grad_norm": 1.5742818117141724, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8752483129501343, + "num_tokens": 699982140.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "grad_norm": 1.6220468282699585, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.889905571937561, + "num_tokens": 700018053.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "grad_norm": 1.5444844961166382, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8745721578598022, + "num_tokens": 700060612.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "grad_norm": 1.4463509321212769, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8836663365364075, + "num_tokens": 700105401.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "grad_norm": 1.6161335706710815, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8857446312904358, + "num_tokens": 700144225.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "grad_norm": 1.6576906442642212, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8829776048660278, + "num_tokens": 700177982.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "grad_norm": 1.5753666162490845, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8739758729934692, + "num_tokens": 700216871.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "grad_norm": 1.5552217960357666, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8806018233299255, + "num_tokens": 700258729.0, + "step": 18355 + }, + { + "epoch": 2.3350718738074034, + "grad_norm": 1.5702170133590698, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8694703578948975, + "num_tokens": 700302407.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "grad_norm": 1.8124850988388062, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.886134922504425, + 
"num_tokens": 700337757.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "grad_norm": 1.5158931016921997, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.876538872718811, + "num_tokens": 700377381.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "grad_norm": 1.642897367477417, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8809643387794495, + "num_tokens": 700415803.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "grad_norm": 1.557500958442688, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8737260103225708, + "num_tokens": 700456511.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "grad_norm": 1.7538609504699707, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.896316647529602, + "num_tokens": 700488220.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "grad_norm": 1.6436716318130493, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8852642178535461, + "num_tokens": 700524257.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "grad_norm": 1.5505280494689941, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.884736955165863, + "num_tokens": 700566664.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "grad_norm": 1.4032618999481201, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8858044147491455, + "num_tokens": 700610342.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "grad_norm": 1.5569297075271606, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.891051173210144, + "num_tokens": 700644316.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "grad_norm": 1.6087300777435303, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8844659924507141, + "num_tokens": 700676142.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "grad_norm": 1.510857105255127, + 
"learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8892402052879333, + "num_tokens": 700717015.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "grad_norm": 1.545472502708435, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8798396587371826, + "num_tokens": 700756634.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "grad_norm": 1.5544013977050781, + "learning_rate": 1e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.8999930620193481, + "num_tokens": 700793601.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "grad_norm": 1.521613597869873, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8843809366226196, + "num_tokens": 700832719.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "grad_norm": 1.5898337364196777, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8950214982032776, + "num_tokens": 700868450.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "grad_norm": 1.570701241493225, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.882641613483429, + "num_tokens": 700906796.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "grad_norm": 1.4069509506225586, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8967558145523071, + "num_tokens": 700948618.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "grad_norm": 1.40919029712677, + "learning_rate": 1e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9038217663764954, + "num_tokens": 700988986.0, + "step": 18374 + }, + { + "epoch": 2.3374888691006235, + "grad_norm": 1.6913954019546509, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8788836002349854, + "num_tokens": 701024665.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "grad_norm": 1.5866479873657227, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8864930272102356, + "num_tokens": 701063889.0, + "step": 18376 
+ }, + { + "epoch": 2.3377432896578045, + "grad_norm": 1.5378836393356323, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8760380744934082, + "num_tokens": 701107915.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "grad_norm": 1.5754433870315552, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8795993328094482, + "num_tokens": 701146201.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "grad_norm": 1.5846678018569946, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8735726475715637, + "num_tokens": 701187909.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "grad_norm": 1.5572222471237183, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8920245170593262, + "num_tokens": 701224490.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "grad_norm": 1.5336054563522339, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8696606755256653, + "num_tokens": 701268093.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "grad_norm": 1.620529055595398, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8854561448097229, + "num_tokens": 701305299.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "grad_norm": 1.7003238201141357, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8833189010620117, + "num_tokens": 701336886.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "grad_norm": 1.515684962272644, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8942216038703918, + "num_tokens": 701377114.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "grad_norm": 1.55954110622406, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8843860626220703, + "num_tokens": 701415590.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "grad_norm": 1.561113715171814, + "learning_rate": 1e-06, + "loss": 0.3168, + 
"mean_token_accuracy": 0.8883385062217712, + "num_tokens": 701454323.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "grad_norm": 1.4739147424697876, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8788660168647766, + "num_tokens": 701495331.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "grad_norm": 1.5588685274124146, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8974287509918213, + "num_tokens": 701531896.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "grad_norm": 1.579777479171753, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8873528242111206, + "num_tokens": 701567200.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "grad_norm": 1.6176968812942505, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8816871643066406, + "num_tokens": 701602254.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "grad_norm": 1.4076182842254639, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.885338306427002, + "num_tokens": 701647037.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "grad_norm": 1.5295835733413696, + "learning_rate": 1e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.8999810218811035, + "num_tokens": 701680003.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "grad_norm": 1.4246089458465576, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8989339470863342, + "num_tokens": 701722388.0, + "step": 18393 + }, + { + "epoch": 2.339905864393843, + "grad_norm": 1.5687278509140015, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8935208320617676, + "num_tokens": 701758184.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "grad_norm": 1.5813199281692505, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8886076211929321, + "num_tokens": 701793656.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, 
+ "grad_norm": 1.5111243724822998, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8906494379043579, + "num_tokens": 701833985.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "grad_norm": 1.7500747442245483, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8846931457519531, + "num_tokens": 701866499.0, + "step": 18397 + }, + { + "epoch": 2.340414705508205, + "grad_norm": 1.3938815593719482, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8875356316566467, + "num_tokens": 701912740.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "grad_norm": 1.4329661130905151, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8854027986526489, + "num_tokens": 701957816.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "grad_norm": 1.5896767377853394, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.880562424659729, + "num_tokens": 701999032.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "grad_norm": 1.6783350706100464, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8871830701828003, + "num_tokens": 702033751.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "grad_norm": 1.662906527519226, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8753350973129272, + "num_tokens": 702072121.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "grad_norm": 1.753665566444397, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8888735771179199, + "num_tokens": 702104170.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "grad_norm": 1.6522290706634521, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.884666383266449, + "num_tokens": 702138561.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "grad_norm": 1.6932692527770996, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8744645714759827, + 
"num_tokens": 702175743.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "grad_norm": 1.760945439338684, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8664718270301819, + "num_tokens": 702211013.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "grad_norm": 1.739674687385559, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8754943609237671, + "num_tokens": 702248950.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "grad_norm": 1.5767780542373657, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8743850588798523, + "num_tokens": 702288992.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "grad_norm": 1.6321617364883423, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8881215453147888, + "num_tokens": 702322879.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "grad_norm": 1.5214353799819946, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8921285271644592, + "num_tokens": 702363303.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "grad_norm": 1.6361831426620483, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8886675834655762, + "num_tokens": 702397300.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "grad_norm": 1.5268300771713257, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8893221020698547, + "num_tokens": 702435689.0, + "step": 18412 + }, + { + "epoch": 2.3423228596870627, + "grad_norm": 1.493998646736145, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.885238528251648, + "num_tokens": 702478765.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "grad_norm": 1.701686978340149, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.880292534828186, + "num_tokens": 702512025.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "grad_norm": 1.5012681484222412, + 
"learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8942669630050659, + "num_tokens": 702550568.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "grad_norm": 1.4903969764709473, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8957751393318176, + "num_tokens": 702591475.0, + "step": 18416 + }, + { + "epoch": 2.3428317008014248, + "grad_norm": 1.5960822105407715, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8793493509292603, + "num_tokens": 702629145.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "grad_norm": 1.6873823404312134, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8788186311721802, + "num_tokens": 702664285.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "grad_norm": 1.480771780014038, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8825263977050781, + "num_tokens": 702705329.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "grad_norm": 1.5746408700942993, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8881206512451172, + "num_tokens": 702741873.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "grad_norm": 1.7755491733551025, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.886089563369751, + "num_tokens": 702774638.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "grad_norm": 1.6603702306747437, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8904026746749878, + "num_tokens": 702806017.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "grad_norm": 1.577582836151123, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8800644874572754, + "num_tokens": 702844491.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "grad_norm": 1.5345356464385986, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8801041841506958, + "num_tokens": 702885376.0, + 
"step": 18424 + }, + { + "epoch": 2.343849383030149, + "grad_norm": 1.5221073627471924, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8902852535247803, + "num_tokens": 702925133.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "grad_norm": 1.5032258033752441, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8928030729293823, + "num_tokens": 702963519.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "grad_norm": 1.6678696870803833, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8919405937194824, + "num_tokens": 702999307.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "grad_norm": 1.5796177387237549, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8718292117118835, + "num_tokens": 703038573.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "grad_norm": 1.5472149848937988, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8923732042312622, + "num_tokens": 703076720.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "grad_norm": 1.5529346466064453, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8675975799560547, + "num_tokens": 703121050.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "grad_norm": 1.6859251260757446, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.862032413482666, + "num_tokens": 703162301.0, + "step": 18431 + }, + { + "epoch": 2.3447398549802823, + "grad_norm": 1.473111629486084, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8906470537185669, + "num_tokens": 703200560.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "grad_norm": 1.3528125286102295, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8987141847610474, + "num_tokens": 703241608.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "grad_norm": 1.4078772068023682, + "learning_rate": 1e-06, + "loss": 
0.2923, + "mean_token_accuracy": 0.8928386569023132, + "num_tokens": 703283380.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "grad_norm": 1.6934536695480347, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8896053433418274, + "num_tokens": 703316833.0, + "step": 18435 + }, + { + "epoch": 2.3452486960946444, + "grad_norm": 1.5706602334976196, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8754958510398865, + "num_tokens": 703355247.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "grad_norm": 1.591866135597229, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8739291429519653, + "num_tokens": 703394332.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "grad_norm": 1.5022528171539307, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.889803409576416, + "num_tokens": 703435355.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "grad_norm": 1.8269442319869995, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8780146837234497, + "num_tokens": 703465809.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "grad_norm": 1.6849640607833862, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8804033994674683, + "num_tokens": 703501258.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "grad_norm": 1.4854137897491455, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8812625408172607, + "num_tokens": 703545172.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "grad_norm": 1.633828043937683, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8736779093742371, + "num_tokens": 703584038.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "grad_norm": 1.5586349964141846, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8825227618217468, + "num_tokens": 703623773.0, + "step": 18443 + }, + { + "epoch": 
2.3462663783233686, + "grad_norm": 1.6091032028198242, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8781938552856445, + "num_tokens": 703657061.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "grad_norm": 1.5049631595611572, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8976614475250244, + "num_tokens": 703695509.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "grad_norm": 1.3608222007751465, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8858449459075928, + "num_tokens": 703746332.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "grad_norm": 1.5929468870162964, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8915866613388062, + "num_tokens": 703784693.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "grad_norm": 1.4740663766860962, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8941127061843872, + "num_tokens": 703829931.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "grad_norm": 1.5603376626968384, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8694924712181091, + "num_tokens": 703872698.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "grad_norm": 1.552647352218628, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8880507946014404, + "num_tokens": 703913277.0, + "step": 18450 + }, + { + "epoch": 2.3471568502735023, + "grad_norm": 1.5774240493774414, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8900299072265625, + "num_tokens": 703949899.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "grad_norm": 1.5717320442199707, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8922878503799438, + "num_tokens": 703986127.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "grad_norm": 1.5704357624053955, + "learning_rate": 1e-06, + "loss": 0.3462, + 
"mean_token_accuracy": 0.8792930841445923, + "num_tokens": 704026385.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "grad_norm": 1.6093388795852661, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8887123465538025, + "num_tokens": 704061007.0, + "step": 18454 + }, + { + "epoch": 2.347665691387864, + "grad_norm": 1.5644211769104004, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8929212689399719, + "num_tokens": 704100669.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "grad_norm": 1.5580068826675415, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8832002878189087, + "num_tokens": 704141197.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "grad_norm": 1.7840478420257568, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8861658573150635, + "num_tokens": 704169602.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "grad_norm": 1.3368268013000488, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8977800011634827, + "num_tokens": 704217455.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "grad_norm": 1.5067014694213867, + "learning_rate": 1e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.896547794342041, + "num_tokens": 704252775.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "grad_norm": 1.6879756450653076, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8731822967529297, + "num_tokens": 704287928.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "grad_norm": 1.55950927734375, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8821151256561279, + "num_tokens": 704330085.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "grad_norm": 1.5526195764541626, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.8988679051399231, + "num_tokens": 704366573.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, 
+ "grad_norm": 1.7061291933059692, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8823432326316833, + "num_tokens": 704404699.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "grad_norm": 1.5058891773223877, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8925515413284302, + "num_tokens": 704443502.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "grad_norm": 1.722261667251587, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.883647084236145, + "num_tokens": 704475840.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "grad_norm": 1.635980248451233, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.871628999710083, + "num_tokens": 704514756.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "grad_norm": 1.6304965019226074, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8955157995223999, + "num_tokens": 704549051.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "grad_norm": 1.5376346111297607, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8877578973770142, + "num_tokens": 704586527.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "grad_norm": 1.4961782693862915, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8909443616867065, + "num_tokens": 704629301.0, + "step": 18469 + }, + { + "epoch": 2.349573845566722, + "grad_norm": 1.5435194969177246, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8898802995681763, + "num_tokens": 704666726.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "grad_norm": 1.6084123849868774, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8903495073318481, + "num_tokens": 704705491.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "grad_norm": 1.613608479499817, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.876761794090271, + 
"num_tokens": 704743732.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "grad_norm": 1.5196715593338013, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8884229063987732, + "num_tokens": 704781225.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "grad_norm": 1.5641192197799683, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8901851773262024, + "num_tokens": 704816960.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "grad_norm": 1.5171047449111938, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.89457768201828, + "num_tokens": 704855530.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "grad_norm": 1.3871747255325317, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8936213254928589, + "num_tokens": 704899330.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "grad_norm": 1.6196773052215576, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8832516670227051, + "num_tokens": 704938550.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "grad_norm": 1.8017570972442627, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8704347610473633, + "num_tokens": 704970749.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "grad_norm": 1.4696142673492432, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8920445442199707, + "num_tokens": 705012472.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "grad_norm": 1.8076664209365845, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8786317706108093, + "num_tokens": 705041755.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "grad_norm": 1.5736364126205444, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8959691524505615, + "num_tokens": 705075678.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "grad_norm": 1.4658383131027222, + 
"learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.894487738609314, + "num_tokens": 705116709.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "grad_norm": 1.5387604236602783, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8871303796768188, + "num_tokens": 705155159.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "grad_norm": 1.6153795719146729, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8849348425865173, + "num_tokens": 705193567.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "grad_norm": 1.6251956224441528, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8924992084503174, + "num_tokens": 705228210.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "grad_norm": 1.700434684753418, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8862086534500122, + "num_tokens": 705263904.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "grad_norm": 1.6277037858963013, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8849852085113525, + "num_tokens": 705299147.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "grad_norm": 1.6454333066940308, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8832845687866211, + "num_tokens": 705336122.0, + "step": 18488 + }, + { + "epoch": 2.3519908408599415, + "grad_norm": 1.7160308361053467, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8925296664237976, + "num_tokens": 705368455.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "grad_norm": 1.593696117401123, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8671291470527649, + "num_tokens": 705409217.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "grad_norm": 1.633824348449707, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8955010175704956, + "num_tokens": 705443651.0, + "step": 
18491 + }, + { + "epoch": 2.352372471695713, + "grad_norm": 1.4581058025360107, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8966182470321655, + "num_tokens": 705481667.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "grad_norm": 1.6426429748535156, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8846368193626404, + "num_tokens": 705518984.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "grad_norm": 1.6860429048538208, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8638741374015808, + "num_tokens": 705557030.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "grad_norm": 1.5659292936325073, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8809524178504944, + "num_tokens": 705593422.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "grad_norm": 1.6475483179092407, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8824688196182251, + "num_tokens": 705627653.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "grad_norm": 1.3966350555419922, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8885754346847534, + "num_tokens": 705676046.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "grad_norm": 1.5362025499343872, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8865858316421509, + "num_tokens": 705715285.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "grad_norm": 1.630361557006836, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8830256462097168, + "num_tokens": 705750320.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "grad_norm": 1.6417185068130493, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8874479532241821, + "num_tokens": 705786804.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "grad_norm": 1.5456933975219727, + "learning_rate": 1e-06, + "loss": 0.2903, 
+ "mean_token_accuracy": 0.8957660794258118, + "num_tokens": 705823583.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "grad_norm": 1.5258574485778809, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9029736518859863, + "num_tokens": 705860229.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "grad_norm": 1.5840226411819458, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8907043933868408, + "num_tokens": 705897436.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "grad_norm": 1.7369276285171509, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8796049952507019, + "num_tokens": 705934564.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "grad_norm": 1.7316126823425293, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8807148933410645, + "num_tokens": 705969869.0, + "step": 18505 + }, + { + "epoch": 2.35415341559598, + "grad_norm": 1.488827109336853, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8845734000205994, + "num_tokens": 706009951.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "grad_norm": 1.516400694847107, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8882994651794434, + "num_tokens": 706050811.0, + "step": 18507 + }, + { + "epoch": 2.354407836153161, + "grad_norm": 1.5478055477142334, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8850671052932739, + "num_tokens": 706088290.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "grad_norm": 1.6974573135375977, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.89593505859375, + "num_tokens": 706123270.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "grad_norm": 1.6706205606460571, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8912570476531982, + "num_tokens": 706155330.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + 
"grad_norm": 1.5685325860977173, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8921141028404236, + "num_tokens": 706189855.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "grad_norm": 1.348974585533142, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8931007385253906, + "num_tokens": 706233943.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "grad_norm": 1.582675576210022, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8840120434761047, + "num_tokens": 706269785.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "grad_norm": 1.5604827404022217, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8833118677139282, + "num_tokens": 706308780.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "grad_norm": 1.666793942451477, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8905166387557983, + "num_tokens": 706343641.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "grad_norm": 1.624727487564087, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8926377296447754, + "num_tokens": 706378270.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "grad_norm": 1.5029338598251343, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8731234073638916, + "num_tokens": 706419424.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "grad_norm": 1.3911502361297607, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8860791921615601, + "num_tokens": 706464294.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "grad_norm": 1.6381022930145264, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.886554479598999, + "num_tokens": 706499584.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "grad_norm": 1.5116405487060547, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8745548725128174, + 
"num_tokens": 706539542.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "grad_norm": 1.5024892091751099, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8794858455657959, + "num_tokens": 706580115.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "grad_norm": 1.7360528707504272, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8810082674026489, + "num_tokens": 706615576.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "grad_norm": 1.646437406539917, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8745735883712769, + "num_tokens": 706652686.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "grad_norm": 1.640510082244873, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.880501389503479, + "num_tokens": 706689967.0, + "step": 18524 + }, + { + "epoch": 2.3565704108891996, + "grad_norm": 1.4929616451263428, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8969850540161133, + "num_tokens": 706727174.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "grad_norm": 1.6667163372039795, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9000938534736633, + "num_tokens": 706757463.0, + "step": 18526 + }, + { + "epoch": 2.3568248314463807, + "grad_norm": 1.7146155834197998, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8714062571525574, + "num_tokens": 706793533.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "grad_norm": 1.5703142881393433, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8903486728668213, + "num_tokens": 706830112.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "grad_norm": 1.4181551933288574, + "learning_rate": 1e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.9035447835922241, + "num_tokens": 706871315.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "grad_norm": 1.5334155559539795, + 
"learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8888953924179077, + "num_tokens": 706912879.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "grad_norm": 1.6081626415252686, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8767493963241577, + "num_tokens": 706952199.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "grad_norm": 1.5077764987945557, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8932693600654602, + "num_tokens": 706990489.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "grad_norm": 1.6237552165985107, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8907960653305054, + "num_tokens": 707027735.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "grad_norm": 1.710761308670044, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8991013765335083, + "num_tokens": 707058363.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "grad_norm": 1.4682369232177734, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8871446847915649, + "num_tokens": 707101309.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "grad_norm": 1.6574236154556274, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8840187788009644, + "num_tokens": 707141872.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "grad_norm": 1.4878180027008057, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8906247615814209, + "num_tokens": 707181598.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "grad_norm": 1.5234451293945312, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8791669607162476, + "num_tokens": 707225589.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "grad_norm": 1.4800180196762085, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8874319791793823, + "num_tokens": 707270828.0, + 
"step": 18539 + }, + { + "epoch": 2.3584785650680575, + "grad_norm": 1.5831186771392822, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8950039148330688, + "num_tokens": 707305743.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "grad_norm": 1.6023378372192383, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8848965764045715, + "num_tokens": 707344019.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "grad_norm": 1.5089155435562134, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8904311656951904, + "num_tokens": 707381586.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "grad_norm": 1.6184011697769165, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8746354579925537, + "num_tokens": 707417421.0, + "step": 18543 + }, + { + "epoch": 2.3589874061824196, + "grad_norm": 1.5343186855316162, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8726271390914917, + "num_tokens": 707459206.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "grad_norm": 1.493050456047058, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8834531307220459, + "num_tokens": 707500205.0, + "step": 18545 + }, + { + "epoch": 2.3592418267396007, + "grad_norm": 1.4755289554595947, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8936039209365845, + "num_tokens": 707537215.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "grad_norm": 1.5550835132598877, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8807138204574585, + "num_tokens": 707578224.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "grad_norm": 1.599374771118164, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8925272822380066, + "num_tokens": 707614138.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "grad_norm": 1.6491780281066895, + "learning_rate": 1e-06, + "loss": 
0.312, + "mean_token_accuracy": 0.8861978650093079, + "num_tokens": 707656415.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "grad_norm": 1.5698459148406982, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8973492383956909, + "num_tokens": 707693106.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "grad_norm": 1.5423222780227661, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8968961238861084, + "num_tokens": 707729425.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "grad_norm": 1.6928749084472656, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8760666251182556, + "num_tokens": 707764097.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "grad_norm": 1.4617069959640503, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8961077332496643, + "num_tokens": 707801757.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "grad_norm": 1.575425624847412, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8947035074234009, + "num_tokens": 707840524.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "grad_norm": 1.5419765710830688, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8823354244232178, + "num_tokens": 707876627.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "grad_norm": 1.7156155109405518, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8870805501937866, + "num_tokens": 707909119.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "grad_norm": 1.5664044618606567, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8866519331932068, + "num_tokens": 707945428.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "grad_norm": 1.6227279901504517, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8917100429534912, + "num_tokens": 707980565.0, + "step": 18558 + }, + { + "epoch": 
2.360895560361277, + "grad_norm": 1.74659264087677, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8730088472366333, + "num_tokens": 708017737.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "grad_norm": 1.5449094772338867, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8865371346473694, + "num_tokens": 708057416.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "grad_norm": 1.5561882257461548, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8871887922286987, + "num_tokens": 708094590.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "grad_norm": 1.5010000467300415, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.891430139541626, + "num_tokens": 708135915.0, + "step": 18562 + }, + { + "epoch": 2.3614044014756392, + "grad_norm": 1.47294282913208, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8906596899032593, + "num_tokens": 708177335.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "grad_norm": 1.5961889028549194, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8881111741065979, + "num_tokens": 708210990.0, + "step": 18564 + }, + { + "epoch": 2.3616588220328203, + "grad_norm": 1.6037242412567139, + "learning_rate": 1e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9010598659515381, + "num_tokens": 708247060.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "grad_norm": 1.4851820468902588, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8830997347831726, + "num_tokens": 708287487.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "grad_norm": 1.5883970260620117, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8914933800697327, + "num_tokens": 708321224.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "grad_norm": 1.6386951208114624, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 
0.8925528526306152, + "num_tokens": 708351462.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "grad_norm": 1.6588985919952393, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8920021653175354, + "num_tokens": 708386078.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "grad_norm": 1.8371657133102417, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8847137093544006, + "num_tokens": 708417284.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "grad_norm": 1.6587623357772827, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8689637780189514, + "num_tokens": 708452319.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "grad_norm": 1.5784324407577515, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8928198218345642, + "num_tokens": 708488662.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "grad_norm": 1.6417094469070435, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8829623460769653, + "num_tokens": 708523278.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "grad_norm": 1.7590417861938477, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8893482089042664, + "num_tokens": 708554461.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "grad_norm": 1.5737677812576294, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8825790882110596, + "num_tokens": 708594586.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "grad_norm": 1.5004873275756836, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8902199268341064, + "num_tokens": 708634266.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "grad_norm": 1.6129815578460693, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.875691294670105, + "num_tokens": 708676188.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "grad_norm": 
1.4798130989074707, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8882395625114441, + "num_tokens": 708717409.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "grad_norm": 1.6118693351745605, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8894869089126587, + "num_tokens": 708752511.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "grad_norm": 1.578973412513733, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8910704255104065, + "num_tokens": 708790750.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "grad_norm": 1.548269510269165, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8743317127227783, + "num_tokens": 708831068.0, + "step": 18581 + }, + { + "epoch": 2.363821396768859, + "grad_norm": 1.5491615533828735, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8788219690322876, + "num_tokens": 708868363.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "grad_norm": 1.6483994722366333, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.884941816329956, + "num_tokens": 708902981.0, + "step": 18583 + }, + { + "epoch": 2.36407581732604, + "grad_norm": 1.4954662322998047, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8908070921897888, + "num_tokens": 708942817.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "grad_norm": 1.597637414932251, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8908853530883789, + "num_tokens": 708978480.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "grad_norm": 1.6995564699172974, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8847962021827698, + "num_tokens": 709012889.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "grad_norm": 1.5997298955917358, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8935716152191162, + "num_tokens": 
709048560.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "grad_norm": 1.6484017372131348, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8775376081466675, + "num_tokens": 709085816.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "grad_norm": 1.4699008464813232, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8906296491622925, + "num_tokens": 709123128.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "grad_norm": 1.4366211891174316, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8805283904075623, + "num_tokens": 709167757.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "grad_norm": 1.6222503185272217, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8887474536895752, + "num_tokens": 709204698.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "grad_norm": 1.5675736665725708, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8818141222000122, + "num_tokens": 709240437.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "grad_norm": 1.5383814573287964, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.9011765122413635, + "num_tokens": 709275928.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "grad_norm": 1.4825791120529175, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8934979438781738, + "num_tokens": 709314183.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "grad_norm": 1.5739617347717285, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8771846890449524, + "num_tokens": 709354278.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "grad_norm": 1.4750698804855347, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8904077410697937, + "num_tokens": 709395522.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "grad_norm": 1.461031198501587, + "learning_rate": 
1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8911832571029663, + "num_tokens": 709436620.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "grad_norm": 1.6206594705581665, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8887174129486084, + "num_tokens": 709470369.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "grad_norm": 1.6765495538711548, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8892711400985718, + "num_tokens": 709508457.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "grad_norm": 1.6538670063018799, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8849877119064331, + "num_tokens": 709540705.0, + "step": 18600 + }, + { + "epoch": 2.3662383920620784, + "grad_norm": 1.5425084829330444, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8888435959815979, + "num_tokens": 709579244.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "grad_norm": 1.5276877880096436, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8849043846130371, + "num_tokens": 709620215.0, + "step": 18602 + }, + { + "epoch": 2.3664928126192595, + "grad_norm": 1.5425280332565308, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8868659138679504, + "num_tokens": 709658723.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "grad_norm": 1.439617395401001, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8930795192718506, + "num_tokens": 709699989.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "grad_norm": 1.453997254371643, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8813124299049377, + "num_tokens": 709745349.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "grad_norm": 1.5780024528503418, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8910260200500488, + "num_tokens": 709782333.0, + "step": 18606 + }, + { + 
"epoch": 2.3670016537336216, + "grad_norm": 1.6145706176757812, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.890506386756897, + "num_tokens": 709813413.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "grad_norm": 1.6163307428359985, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8858655691146851, + "num_tokens": 709851128.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "grad_norm": 1.4948079586029053, + "learning_rate": 1e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.9015724062919617, + "num_tokens": 709888608.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "grad_norm": 1.5458340644836426, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8889394402503967, + "num_tokens": 709924954.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "grad_norm": 1.4569625854492188, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8685170412063599, + "num_tokens": 709973856.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "grad_norm": 1.506801962852478, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8850415945053101, + "num_tokens": 710013485.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "grad_norm": 1.6783814430236816, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8759604096412659, + "num_tokens": 710050723.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "grad_norm": 1.667772650718689, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8699896335601807, + "num_tokens": 710088301.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "grad_norm": 1.5040563344955444, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8889892101287842, + "num_tokens": 710126442.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "grad_norm": 1.5429306030273438, + "learning_rate": 1e-06, + "loss": 0.3351, + 
"mean_token_accuracy": 0.879692554473877, + "num_tokens": 710164957.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "grad_norm": 1.6103254556655884, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8971083760261536, + "num_tokens": 710198200.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "grad_norm": 1.548642873764038, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8833960294723511, + "num_tokens": 710238206.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "grad_norm": 1.6702779531478882, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8861753940582275, + "num_tokens": 710274156.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, + "grad_norm": 1.680336594581604, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8811686038970947, + "num_tokens": 710308818.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "grad_norm": 1.5737111568450928, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8802472352981567, + "num_tokens": 710351172.0, + "step": 18621 + }, + { + "epoch": 2.3689098079124795, + "grad_norm": 1.5579426288604736, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8694139122962952, + "num_tokens": 710391290.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "grad_norm": 1.6963896751403809, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.8989244699478149, + "num_tokens": 710425293.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "grad_norm": 1.5247318744659424, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8797382712364197, + "num_tokens": 710466407.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "grad_norm": 1.6304051876068115, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8871538043022156, + "num_tokens": 710502560.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, 
+ "grad_norm": 1.628601312637329, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8817218542098999, + "num_tokens": 710541653.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "grad_norm": 1.528633713722229, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8860888481140137, + "num_tokens": 710579967.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "grad_norm": 1.730754017829895, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.880361020565033, + "num_tokens": 710613863.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "grad_norm": 1.6427961587905884, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8806790113449097, + "num_tokens": 710651686.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "grad_norm": 1.6823514699935913, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8852895498275757, + "num_tokens": 710687241.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "grad_norm": 1.6380536556243896, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8815129399299622, + "num_tokens": 710726132.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "grad_norm": 1.5395585298538208, + "learning_rate": 1e-06, + "loss": 0.2633, + "mean_token_accuracy": 0.9039875864982605, + "num_tokens": 710762884.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "grad_norm": 1.5742831230163574, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.8986192941665649, + "num_tokens": 710800813.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "grad_norm": 1.5785024166107178, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8832741975784302, + "num_tokens": 710840785.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "grad_norm": 1.4452404975891113, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9014495015144348, + 
"num_tokens": 710879025.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "grad_norm": 1.547236680984497, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.892830491065979, + "num_tokens": 710919747.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "grad_norm": 1.532912254333496, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8842519521713257, + "num_tokens": 710958636.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "grad_norm": 1.5664427280426025, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8880364298820496, + "num_tokens": 710995387.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "grad_norm": 1.5272130966186523, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8756682872772217, + "num_tokens": 711039743.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "grad_norm": 1.4288793802261353, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8918699622154236, + "num_tokens": 711082914.0, + "step": 18640 + }, + { + "epoch": 2.371326803205699, + "grad_norm": 1.4491071701049805, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8895648717880249, + "num_tokens": 711123107.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "grad_norm": 1.476985216140747, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8972207307815552, + "num_tokens": 711162840.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "grad_norm": 1.5341635942459106, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8917879462242126, + "num_tokens": 711199313.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "grad_norm": 1.5423002243041992, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8735861778259277, + "num_tokens": 711239926.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "grad_norm": 1.7181602716445923, + 
"learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9008383750915527, + "num_tokens": 711267946.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "grad_norm": 1.6347817182540894, + "learning_rate": 1e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9045039415359497, + "num_tokens": 711298402.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "grad_norm": 1.4637627601623535, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8885263800621033, + "num_tokens": 711341032.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "grad_norm": 1.390091896057129, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8917299509048462, + "num_tokens": 711387310.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "grad_norm": 1.478982925415039, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8814963102340698, + "num_tokens": 711432710.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "grad_norm": 1.4417859315872192, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8891996741294861, + "num_tokens": 711473665.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "grad_norm": 1.5953052043914795, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8891897797584534, + "num_tokens": 711508682.0, + "step": 18651 + }, + { + "epoch": 2.3727261162701945, + "grad_norm": 1.5425264835357666, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8842867016792297, + "num_tokens": 711548036.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "grad_norm": 1.4928596019744873, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8996601700782776, + "num_tokens": 711586802.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "grad_norm": 1.5920891761779785, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8762268424034119, + "num_tokens": 711625204.0, + 
"step": 18654 + }, + { + "epoch": 2.373107747105966, + "grad_norm": 1.724914789199829, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8896563053131104, + "num_tokens": 711656413.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "grad_norm": 1.8138574361801147, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8823540210723877, + "num_tokens": 711687567.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "grad_norm": 1.6905732154846191, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.878165602684021, + "num_tokens": 711722679.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "grad_norm": 1.545077919960022, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8744982481002808, + "num_tokens": 711766658.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "grad_norm": 1.665601134300232, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.888475775718689, + "num_tokens": 711799371.0, + "step": 18659 + }, + { + "epoch": 2.3737437984989187, + "grad_norm": 1.6478214263916016, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8799391984939575, + "num_tokens": 711838752.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "grad_norm": 1.7457863092422485, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.891525387763977, + "num_tokens": 711873308.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "grad_norm": 1.5014362335205078, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8900073766708374, + "num_tokens": 711913093.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "grad_norm": 1.4683526754379272, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8837209939956665, + "num_tokens": 711955401.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "grad_norm": 1.6104732751846313, + "learning_rate": 1e-06, + "loss": 
0.2827, + "mean_token_accuracy": 0.8976244926452637, + "num_tokens": 711988255.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "grad_norm": 1.4339709281921387, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8950479030609131, + "num_tokens": 712032166.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "grad_norm": 1.5532174110412598, + "learning_rate": 1e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.8976815938949585, + "num_tokens": 712068167.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "grad_norm": 1.4951151609420776, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8849163055419922, + "num_tokens": 712105819.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "grad_norm": 1.6641801595687866, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.874533474445343, + "num_tokens": 712143096.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "grad_norm": 1.5311163663864136, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8802570104598999, + "num_tokens": 712184299.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "grad_norm": 1.6330808401107788, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8749271631240845, + "num_tokens": 712223294.0, + "step": 18670 + }, + { + "epoch": 2.3751431115634145, + "grad_norm": 1.4772826433181763, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9034581184387207, + "num_tokens": 712260558.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "grad_norm": 1.6065911054611206, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8775191307067871, + "num_tokens": 712301144.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "grad_norm": 1.4116542339324951, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8877677917480469, + "num_tokens": 712343350.0, + "step": 18673 + }, + { + "epoch": 
2.375524742399186, + "grad_norm": 1.570595145225525, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8942179083824158, + "num_tokens": 712380126.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "grad_norm": 1.6563318967819214, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8796653747558594, + "num_tokens": 712413894.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "grad_norm": 1.5848149061203003, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8834845423698425, + "num_tokens": 712454240.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "grad_norm": 1.7428115606307983, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.875114917755127, + "num_tokens": 712498093.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "grad_norm": 1.7774561643600464, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8816931247711182, + "num_tokens": 712531296.0, + "step": 18678 + }, + { + "epoch": 2.3761607937921383, + "grad_norm": 1.5029075145721436, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.890900731086731, + "num_tokens": 712571335.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "grad_norm": 1.6719969511032104, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8659096956253052, + "num_tokens": 712609606.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "grad_norm": 1.6758345365524292, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8950585126876831, + "num_tokens": 712642187.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "grad_norm": 1.44683039188385, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8804665207862854, + "num_tokens": 712685798.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "grad_norm": 1.4878778457641602, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 
0.8951135873794556, + "num_tokens": 712726617.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "grad_norm": 1.5133552551269531, + "learning_rate": 1e-06, + "loss": 0.2658, + "mean_token_accuracy": 0.9026747941970825, + "num_tokens": 712760955.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "grad_norm": 1.4944528341293335, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8865967988967896, + "num_tokens": 712798980.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "grad_norm": 1.6977922916412354, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8838363885879517, + "num_tokens": 712835341.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "grad_norm": 1.4913794994354248, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8852752447128296, + "num_tokens": 712873571.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "grad_norm": 1.486599087715149, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8903182744979858, + "num_tokens": 712913644.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "grad_norm": 1.5260355472564697, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8754858374595642, + "num_tokens": 712954962.0, + "step": 18689 + }, + { + "epoch": 2.377560106856634, + "grad_norm": 1.4704830646514893, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8957680463790894, + "num_tokens": 712997398.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "grad_norm": 1.7291619777679443, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8743870854377747, + "num_tokens": 713032334.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "grad_norm": 1.3999608755111694, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9020302295684814, + "num_tokens": 713074078.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "grad_norm": 
1.4575223922729492, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8942040205001831, + "num_tokens": 713116478.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "grad_norm": 1.749655842781067, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8750633001327515, + "num_tokens": 713150559.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "grad_norm": 1.6220237016677856, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8721587657928467, + "num_tokens": 713187098.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "grad_norm": 1.6359772682189941, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8833606243133545, + "num_tokens": 713223917.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "grad_norm": 1.5648661851882935, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8897123336791992, + "num_tokens": 713263559.0, + "step": 18697 + }, + { + "epoch": 2.378577789085358, + "grad_norm": 1.5287705659866333, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.893115758895874, + "num_tokens": 713302975.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "grad_norm": 1.5541588068008423, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8944282531738281, + "num_tokens": 713339003.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "grad_norm": 1.5783824920654297, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8873871564865112, + "num_tokens": 713375558.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "grad_norm": 1.6000292301177979, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8734395503997803, + "num_tokens": 713415776.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "grad_norm": 1.588720440864563, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8829437494277954, + "num_tokens": 
713455898.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "grad_norm": 1.4985650777816772, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.895317554473877, + "num_tokens": 713494634.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "grad_norm": 1.6023582220077515, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8840557932853699, + "num_tokens": 713532936.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "grad_norm": 1.6222432851791382, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.883599042892456, + "num_tokens": 713570942.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "grad_norm": 1.6386123895645142, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8636540770530701, + "num_tokens": 713611707.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "grad_norm": 1.5321036577224731, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8935839533805847, + "num_tokens": 713650742.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "grad_norm": 1.5506277084350586, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8979091644287109, + "num_tokens": 713686119.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "grad_norm": 1.6931496858596802, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.874574601650238, + "num_tokens": 713721705.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "grad_norm": 1.4611854553222656, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8788032531738281, + "num_tokens": 713766940.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "grad_norm": 1.5114721059799194, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8785151839256287, + "num_tokens": 713805967.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "grad_norm": 1.8310139179229736, + "learning_rate": 
1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8836816549301147, + "num_tokens": 713836534.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "grad_norm": 1.6437987089157104, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.882941484451294, + "num_tokens": 713872537.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "grad_norm": 1.564273715019226, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8857365846633911, + "num_tokens": 713911976.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "grad_norm": 1.4758027791976929, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8748073577880859, + "num_tokens": 713953699.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "grad_norm": 1.5485966205596924, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8881510496139526, + "num_tokens": 713994024.0, + "step": 18716 + }, + { + "epoch": 2.380994784378578, + "grad_norm": 1.6201910972595215, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8953371644020081, + "num_tokens": 714028013.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "grad_norm": 1.599640965461731, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8919194340705872, + "num_tokens": 714066881.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "grad_norm": 1.5672672986984253, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8810087442398071, + "num_tokens": 714106455.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "grad_norm": 1.5867758989334106, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8936901092529297, + "num_tokens": 714143666.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "grad_norm": 1.560575008392334, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8937779664993286, + "num_tokens": 714179058.0, + "step": 18721 + }, + { + 
"epoch": 2.3816308357715306, + "grad_norm": 1.577375054359436, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8900787234306335, + "num_tokens": 714217641.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "grad_norm": 1.813359022140503, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8774685859680176, + "num_tokens": 714253151.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "grad_norm": 1.6736687421798706, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.887924313545227, + "num_tokens": 714290960.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "grad_norm": 1.5973087549209595, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8791160583496094, + "num_tokens": 714328967.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "grad_norm": 1.783124327659607, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.887565016746521, + "num_tokens": 714364477.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "grad_norm": 1.5886783599853516, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8741393089294434, + "num_tokens": 714409971.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "grad_norm": 1.5206085443496704, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8908946514129639, + "num_tokens": 714450304.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "grad_norm": 1.5244346857070923, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8970250487327576, + "num_tokens": 714491875.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "grad_norm": 1.5400508642196655, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8918195962905884, + "num_tokens": 714533929.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "grad_norm": 1.7347139120101929, + "learning_rate": 1e-06, + "loss": 0.3145, + 
"mean_token_accuracy": 0.8878492116928101, + "num_tokens": 714566687.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "grad_norm": 1.5331815481185913, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8806884288787842, + "num_tokens": 714607413.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "grad_norm": 1.7236915826797485, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8759332895278931, + "num_tokens": 714644917.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "grad_norm": 1.529410481452942, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8891587853431702, + "num_tokens": 714681600.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "grad_norm": 1.5011577606201172, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8901393413543701, + "num_tokens": 714719995.0, + "step": 18735 + }, + { + "epoch": 2.3834117796717975, + "grad_norm": 1.4417520761489868, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8906455039978027, + "num_tokens": 714763544.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "grad_norm": 1.6902213096618652, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8761805891990662, + "num_tokens": 714799221.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "grad_norm": 1.540713906288147, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8913067579269409, + "num_tokens": 714834582.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "grad_norm": 1.4657081365585327, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8788674473762512, + "num_tokens": 714876818.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "grad_norm": 1.627597451210022, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8732566833496094, + "num_tokens": 714916265.0, + "step": 18740 + }, + { + "epoch": 2.38404783106475, + 
"grad_norm": 1.7261346578598022, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8793334364891052, + "num_tokens": 714950307.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "grad_norm": 1.5575307607650757, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.899412214756012, + "num_tokens": 714984166.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "grad_norm": 1.6036971807479858, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8799358606338501, + "num_tokens": 715025560.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "grad_norm": 1.7329634428024292, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8630437254905701, + "num_tokens": 715059291.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "grad_norm": 1.614237904548645, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8768966197967529, + "num_tokens": 715097840.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "grad_norm": 1.6778268814086914, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8856876492500305, + "num_tokens": 715134133.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "grad_norm": 1.566004991531372, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.892364501953125, + "num_tokens": 715170115.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "grad_norm": 1.6250500679016113, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8815022706985474, + "num_tokens": 715205844.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "grad_norm": 1.5664130449295044, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8781417608261108, + "num_tokens": 715244736.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "grad_norm": 1.6169133186340332, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.886861264705658, + 
"num_tokens": 715282868.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "grad_norm": 1.5466136932373047, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8703261017799377, + "num_tokens": 715328894.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "grad_norm": 1.5866450071334839, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8903388977050781, + "num_tokens": 715365820.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "grad_norm": 1.5694892406463623, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8855642080307007, + "num_tokens": 715409295.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "grad_norm": 1.5702934265136719, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8875861167907715, + "num_tokens": 715448016.0, + "step": 18754 + }, + { + "epoch": 2.385828774965017, + "grad_norm": 1.607975721359253, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8901965022087097, + "num_tokens": 715483734.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "grad_norm": 1.5024545192718506, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8850116729736328, + "num_tokens": 715523274.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "grad_norm": 1.6002508401870728, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8960053324699402, + "num_tokens": 715558467.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "grad_norm": 1.6287696361541748, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8927997946739197, + "num_tokens": 715594260.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "grad_norm": 1.6588923931121826, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8900482654571533, + "num_tokens": 715632008.0, + "step": 18759 + }, + { + "epoch": 2.3864648263579697, + "grad_norm": 1.4049875736236572, + 
"learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8926849365234375, + "num_tokens": 715674416.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "grad_norm": 1.724281668663025, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8833866715431213, + "num_tokens": 715706953.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "grad_norm": 1.6947356462478638, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8911523818969727, + "num_tokens": 715738652.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "grad_norm": 1.3375976085662842, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8821853399276733, + "num_tokens": 715790951.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "grad_norm": 1.567667007446289, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8946419954299927, + "num_tokens": 715826584.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "grad_norm": 1.525241494178772, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8863358497619629, + "num_tokens": 715866188.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "grad_norm": 1.618249535560608, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.876162052154541, + "num_tokens": 715907765.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "grad_norm": 1.539280652999878, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8861991763114929, + "num_tokens": 715945641.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "grad_norm": 1.5190125703811646, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8845515251159668, + "num_tokens": 715987028.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "grad_norm": 1.6917129755020142, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.877606987953186, + "num_tokens": 716022094.0, + "step": 18769 
+ }, + { + "epoch": 2.387736929143875, + "grad_norm": 1.6297433376312256, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8657974004745483, + "num_tokens": 716061732.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "grad_norm": 1.5086889266967773, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8832962512969971, + "num_tokens": 716102180.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "grad_norm": 1.57137930393219, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8871890306472778, + "num_tokens": 716142225.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "grad_norm": 1.440155267715454, + "learning_rate": 1e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9026118516921997, + "num_tokens": 716177770.0, + "step": 18773 + }, + { + "epoch": 2.3882457702582367, + "grad_norm": 1.5531563758850098, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8721667528152466, + "num_tokens": 716220663.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "grad_norm": 1.5251156091690063, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8913741111755371, + "num_tokens": 716257498.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "grad_norm": 1.5608552694320679, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8792870044708252, + "num_tokens": 716298944.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "grad_norm": 1.7225090265274048, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8807423114776611, + "num_tokens": 716333318.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "grad_norm": 1.6991863250732422, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8658455014228821, + "num_tokens": 716371988.0, + "step": 18778 + }, + { + "epoch": 2.3888818216511893, + "grad_norm": 1.6139492988586426, + "learning_rate": 1e-06, + "loss": 0.2947, + 
"mean_token_accuracy": 0.8915203809738159, + "num_tokens": 716406833.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "grad_norm": 1.5445034503936768, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8877415060997009, + "num_tokens": 716442424.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "grad_norm": 1.6079179048538208, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8908945918083191, + "num_tokens": 716474696.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "grad_norm": 1.4250166416168213, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8962050676345825, + "num_tokens": 716516561.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "grad_norm": 1.6430565118789673, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8858441114425659, + "num_tokens": 716551875.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "grad_norm": 1.5752235651016235, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8785120844841003, + "num_tokens": 716589585.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "grad_norm": 1.547052025794983, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8743607997894287, + "num_tokens": 716632415.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "grad_norm": 1.5613982677459717, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8942803144454956, + "num_tokens": 716670673.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "grad_norm": 1.5552442073822021, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8881727457046509, + "num_tokens": 716708797.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "grad_norm": 1.9285812377929688, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8894619345664978, + "num_tokens": 716751777.0, + "step": 18788 + }, + { + "epoch": 
2.3901539244370946, + "grad_norm": 1.6699641942977905, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8876128196716309, + "num_tokens": 716788092.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "grad_norm": 1.6478111743927002, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8901081085205078, + "num_tokens": 716823446.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "grad_norm": 1.6354610919952393, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.88398677110672, + "num_tokens": 716858920.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "grad_norm": 1.5205665826797485, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8726032972335815, + "num_tokens": 716902257.0, + "step": 18792 + }, + { + "epoch": 2.3906627655514567, + "grad_norm": 1.4645893573760986, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8885403275489807, + "num_tokens": 716943327.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "grad_norm": 1.4726608991622925, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8835381865501404, + "num_tokens": 716982964.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "grad_norm": 1.4975981712341309, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8771027326583862, + "num_tokens": 717026735.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "grad_norm": 1.5361589193344116, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8897973299026489, + "num_tokens": 717064615.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "grad_norm": 1.6257638931274414, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8830172419548035, + "num_tokens": 717102144.0, + "step": 18797 + }, + { + "epoch": 2.391298816944409, + "grad_norm": 1.538619041442871, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 
0.8822476863861084, + "num_tokens": 717140436.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "grad_norm": 1.6212421655654907, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8797288537025452, + "num_tokens": 717181684.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "grad_norm": 1.5378350019454956, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8773950338363647, + "num_tokens": 717220125.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "grad_norm": 1.504136562347412, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8822095394134521, + "num_tokens": 717262951.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "grad_norm": 1.6760382652282715, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8749459385871887, + "num_tokens": 717295814.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "grad_norm": 1.545975685119629, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8834296464920044, + "num_tokens": 717329492.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "grad_norm": 1.510599136352539, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8893360495567322, + "num_tokens": 717368070.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "grad_norm": 1.6012552976608276, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8799148797988892, + "num_tokens": 717408431.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "grad_norm": 1.4521015882492065, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8744204044342041, + "num_tokens": 717453380.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "grad_norm": 1.4933454990386963, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8779061436653137, + "num_tokens": 717491628.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "grad_norm": 
1.4442654848098755, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8997659087181091, + "num_tokens": 717530132.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "grad_norm": 1.6900042295455933, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8763086795806885, + "num_tokens": 717564114.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "grad_norm": 1.4239168167114258, + "learning_rate": 1e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9007756114006042, + "num_tokens": 717605478.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "grad_norm": 1.5743517875671387, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8841548562049866, + "num_tokens": 717643889.0, + "step": 18811 + }, + { + "epoch": 2.3930797608446763, + "grad_norm": 1.7282031774520874, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8877686262130737, + "num_tokens": 717675824.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "grad_norm": 1.5912518501281738, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8928484916687012, + "num_tokens": 717714081.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "grad_norm": 1.5413395166397095, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8843686580657959, + "num_tokens": 717756482.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "grad_norm": 1.761818289756775, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8747064471244812, + "num_tokens": 717795042.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "grad_norm": 1.5762102603912354, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8806891441345215, + "num_tokens": 717836088.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "grad_norm": 1.39066743850708, + "learning_rate": 1e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9054995775222778, + "num_tokens": 
717877762.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "grad_norm": 1.6833200454711914, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8905000686645508, + "num_tokens": 717915628.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "grad_norm": 1.7262238264083862, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8790040016174316, + "num_tokens": 717946940.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "grad_norm": 1.4846560955047607, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8879036903381348, + "num_tokens": 717990230.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "grad_norm": 1.6290963888168335, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8813521862030029, + "num_tokens": 718028704.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "grad_norm": 1.6777560710906982, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8918675184249878, + "num_tokens": 718061332.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "grad_norm": 1.5614116191864014, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8922445774078369, + "num_tokens": 718098980.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "grad_norm": 1.516708254814148, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8944677114486694, + "num_tokens": 718135999.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "grad_norm": 1.4532524347305298, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8971421718597412, + "num_tokens": 718173756.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "grad_norm": 1.588888168334961, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8702128529548645, + "num_tokens": 718212615.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "grad_norm": 1.493657112121582, + "learning_rate": 
1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8863376975059509, + "num_tokens": 718254136.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "grad_norm": 1.5647306442260742, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.895842432975769, + "num_tokens": 718291925.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "grad_norm": 1.804555892944336, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8758347034454346, + "num_tokens": 718330080.0, + "step": 18829 + }, + { + "epoch": 2.3953695458593054, + "grad_norm": 1.4872592687606812, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8801648020744324, + "num_tokens": 718372658.0, + "step": 18830 + }, + { + "epoch": 2.395496756137896, + "grad_norm": 1.4029767513275146, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8923599720001221, + "num_tokens": 718417213.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "grad_norm": 1.6647071838378906, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8844932317733765, + "num_tokens": 718451626.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "grad_norm": 1.5680058002471924, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8733426928520203, + "num_tokens": 718492392.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "grad_norm": 1.4699686765670776, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8802285194396973, + "num_tokens": 718534727.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "grad_norm": 1.6227678060531616, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8838086724281311, + "num_tokens": 718571899.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "grad_norm": 1.4799295663833618, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8859995603561401, + "num_tokens": 718613685.0, + "step": 18836 + }, + { + 
"epoch": 2.396260017809439, + "grad_norm": 1.456791877746582, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8986934423446655, + "num_tokens": 718654095.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "grad_norm": 1.584978461265564, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8843409419059753, + "num_tokens": 718692441.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "grad_norm": 1.3966528177261353, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8979669213294983, + "num_tokens": 718734986.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "grad_norm": 1.7358453273773193, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8712320923805237, + "num_tokens": 718771170.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "grad_norm": 1.4978981018066406, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8944200277328491, + "num_tokens": 718811854.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "grad_norm": 1.5358318090438843, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8894283771514893, + "num_tokens": 718848934.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "grad_norm": 1.5891187191009521, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8817852735519409, + "num_tokens": 718884913.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "grad_norm": 1.5823450088500977, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8713855743408203, + "num_tokens": 718923293.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "grad_norm": 1.4213944673538208, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8933833241462708, + "num_tokens": 718964105.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "grad_norm": 1.4964139461517334, + "learning_rate": 1e-06, + "loss": 0.332, + 
"mean_token_accuracy": 0.8845702409744263, + "num_tokens": 719006800.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "grad_norm": 1.7217217683792114, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8935844898223877, + "num_tokens": 719038507.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "grad_norm": 1.7936043739318848, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8822891712188721, + "num_tokens": 719072259.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "grad_norm": 1.4873255491256714, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8852468729019165, + "num_tokens": 719116972.0, + "step": 18849 + }, + { + "epoch": 2.3979137514311155, + "grad_norm": 1.5182074308395386, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8961149454116821, + "num_tokens": 719155683.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "grad_norm": 1.553405523300171, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8694463968276978, + "num_tokens": 719197344.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "grad_norm": 1.4248253107070923, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8966554403305054, + "num_tokens": 719238624.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "grad_norm": 1.7701829671859741, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8786085844039917, + "num_tokens": 719269644.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "grad_norm": 1.4559931755065918, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8877719640731812, + "num_tokens": 719311630.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "grad_norm": 1.5193344354629517, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8901636600494385, + "num_tokens": 719350019.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + 
"grad_norm": 1.5978590250015259, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8601878881454468, + "num_tokens": 719391155.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "grad_norm": 1.6108945608139038, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8867347240447998, + "num_tokens": 719426808.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "grad_norm": 1.6780273914337158, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8843622207641602, + "num_tokens": 719463641.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "grad_norm": 1.7692874670028687, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8905801177024841, + "num_tokens": 719492970.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "grad_norm": 1.4973978996276855, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.878412127494812, + "num_tokens": 719538517.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "grad_norm": 1.6394906044006348, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8803483843803406, + "num_tokens": 719576786.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "grad_norm": 1.4787880182266235, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8665127158164978, + "num_tokens": 719623513.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "grad_norm": 1.7113556861877441, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8741106986999512, + "num_tokens": 719657775.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "grad_norm": 1.6032720804214478, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8881133198738098, + "num_tokens": 719697633.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "grad_norm": 1.5193113088607788, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8960357308387756, + 
"num_tokens": 719736821.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "grad_norm": 1.6159703731536865, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8837028741836548, + "num_tokens": 719775117.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "grad_norm": 1.6450754404067993, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.885249674320221, + "num_tokens": 719810298.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "grad_norm": 1.5150060653686523, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8848844766616821, + "num_tokens": 719851868.0, + "step": 18868 + }, + { + "epoch": 2.400330746724335, + "grad_norm": 1.6111716032028198, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.89664626121521, + "num_tokens": 719888238.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "grad_norm": 1.5433969497680664, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8870432376861572, + "num_tokens": 719928724.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "grad_norm": 1.5399361848831177, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8855440616607666, + "num_tokens": 719965890.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "grad_norm": 1.4659045934677124, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8852729797363281, + "num_tokens": 720008086.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "grad_norm": 1.3560117483139038, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8971502780914307, + "num_tokens": 720051774.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "grad_norm": 1.6328165531158447, + "learning_rate": 1e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.8997912406921387, + "num_tokens": 720085139.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "grad_norm": 1.6417797803878784, + 
"learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8898391723632812, + "num_tokens": 720120045.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "grad_norm": 1.7284793853759766, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8662497997283936, + "num_tokens": 720153507.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "grad_norm": 1.626323938369751, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.888094961643219, + "num_tokens": 720189062.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "grad_norm": 1.3813847303390503, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8901764154434204, + "num_tokens": 720233356.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "grad_norm": 1.7784314155578613, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.883675754070282, + "num_tokens": 720269317.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "grad_norm": 1.5098631381988525, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8862212896347046, + "num_tokens": 720310976.0, + "step": 18880 + }, + { + "epoch": 2.4018572700674214, + "grad_norm": 1.4443293809890747, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8852978944778442, + "num_tokens": 720351818.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "grad_norm": 1.5719518661499023, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.890365481376648, + "num_tokens": 720386091.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "grad_norm": 1.6427123546600342, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8723448514938354, + "num_tokens": 720424376.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "grad_norm": 1.5769100189208984, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8887535929679871, + "num_tokens": 720462834.0, + "step": 
18884 + }, + { + "epoch": 2.4023661111817836, + "grad_norm": 1.682765245437622, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8877663612365723, + "num_tokens": 720496272.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "grad_norm": 1.5691255331039429, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.886215090751648, + "num_tokens": 720535875.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "grad_norm": 1.6841206550598145, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.88150554895401, + "num_tokens": 720570231.0, + "step": 18887 + }, + { + "epoch": 2.402747742017555, + "grad_norm": 1.4184027910232544, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8889638185501099, + "num_tokens": 720615113.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "grad_norm": 1.5166220664978027, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.880095362663269, + "num_tokens": 720655777.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "grad_norm": 1.581493854522705, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8802857398986816, + "num_tokens": 720695188.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "grad_norm": 1.4565119743347168, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8914384841918945, + "num_tokens": 720738273.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "grad_norm": 1.539810061454773, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8913848400115967, + "num_tokens": 720774625.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "grad_norm": 1.4520457983016968, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.896806001663208, + "num_tokens": 720812770.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "grad_norm": 1.650445818901062, + "learning_rate": 1e-06, + "loss": 0.3249, + 
"mean_token_accuracy": 0.8821905255317688, + "num_tokens": 720847964.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "grad_norm": 1.5994324684143066, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8865891098976135, + "num_tokens": 720884234.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "grad_norm": 1.686570644378662, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8692536950111389, + "num_tokens": 720920410.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "grad_norm": 1.501491665840149, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8950359225273132, + "num_tokens": 720957834.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "grad_norm": 1.6265499591827393, + "learning_rate": 1e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.8984829783439636, + "num_tokens": 720994115.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "grad_norm": 1.5907701253890991, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8802269697189331, + "num_tokens": 721031711.0, + "step": 18899 + }, + { + "epoch": 2.404274265360641, + "grad_norm": 1.65418541431427, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8805835247039795, + "num_tokens": 721066142.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "grad_norm": 1.3852053880691528, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8977237939834595, + "num_tokens": 721109297.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "grad_norm": 1.5326344966888428, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8853318691253662, + "num_tokens": 721150497.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "grad_norm": 1.5785212516784668, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8919410705566406, + "num_tokens": 721188041.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + 
"grad_norm": 1.5997387170791626, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8827294707298279, + "num_tokens": 721227260.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "grad_norm": 1.7345077991485596, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8742259740829468, + "num_tokens": 721261264.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "grad_norm": 1.4917638301849365, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8828707933425903, + "num_tokens": 721304642.0, + "step": 18906 + }, + { + "epoch": 2.4051647373107747, + "grad_norm": 1.683442234992981, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8733980655670166, + "num_tokens": 721341236.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "grad_norm": 1.6509588956832886, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.888008177280426, + "num_tokens": 721373615.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "grad_norm": 1.4955592155456543, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8953101634979248, + "num_tokens": 721413037.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "grad_norm": 1.4922008514404297, + "learning_rate": 1e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9031673669815063, + "num_tokens": 721447211.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "grad_norm": 1.5109059810638428, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.887816309928894, + "num_tokens": 721489315.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "grad_norm": 1.585381269454956, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8874911665916443, + "num_tokens": 721528251.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "grad_norm": 1.607733130455017, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.898515522480011, + 
"num_tokens": 721562183.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "grad_norm": 1.377114176750183, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.889474093914032, + "num_tokens": 721606944.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "grad_norm": 1.5376145839691162, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8822327852249146, + "num_tokens": 721644631.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "grad_norm": 1.4620015621185303, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8963398933410645, + "num_tokens": 721687687.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "grad_norm": 1.5401477813720703, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8931528925895691, + "num_tokens": 721725681.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "grad_norm": 1.612138032913208, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8927502632141113, + "num_tokens": 721758181.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "grad_norm": 1.6329302787780762, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8797279000282288, + "num_tokens": 721793002.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "grad_norm": 1.5201942920684814, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8906325101852417, + "num_tokens": 721829832.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "grad_norm": 1.515699863433838, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8720993399620056, + "num_tokens": 721872731.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "grad_norm": 1.604160189628601, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8772681951522827, + "num_tokens": 721907288.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "grad_norm": 1.6161034107208252, + 
"learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8904158473014832, + "num_tokens": 721944362.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "grad_norm": 1.673663854598999, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8912357091903687, + "num_tokens": 721976895.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "grad_norm": 1.9063348770141602, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8914907574653625, + "num_tokens": 722002400.0, + "step": 18925 + }, + { + "epoch": 2.4075817326039943, + "grad_norm": 1.6676546335220337, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8781275153160095, + "num_tokens": 722036784.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "grad_norm": 1.4998363256454468, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8945684432983398, + "num_tokens": 722078032.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "grad_norm": 1.6384153366088867, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8885528445243835, + "num_tokens": 722113829.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "grad_norm": 1.526197075843811, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8724305033683777, + "num_tokens": 722158062.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "grad_norm": 1.6246587038040161, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.876111626625061, + "num_tokens": 722195081.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "grad_norm": 1.635148525238037, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8803502917289734, + "num_tokens": 722231714.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "grad_norm": 1.5434538125991821, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8928210139274597, + "num_tokens": 722271211.0, + "step": 
18932 + }, + { + "epoch": 2.408472204554128, + "grad_norm": 1.6605485677719116, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8825188875198364, + "num_tokens": 722307543.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "grad_norm": 1.4935604333877563, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8843172788619995, + "num_tokens": 722350718.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "grad_norm": 1.3922353982925415, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8881368637084961, + "num_tokens": 722400051.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "grad_norm": 1.4901610612869263, + "learning_rate": 1e-06, + "loss": 0.25, + "mean_token_accuracy": 0.9072051048278809, + "num_tokens": 722434664.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "grad_norm": 1.4638005495071411, + "learning_rate": 1e-06, + "loss": 0.2714, + "mean_token_accuracy": 0.9002529382705688, + "num_tokens": 722480485.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "grad_norm": 1.7183423042297363, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8831634521484375, + "num_tokens": 722513814.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "grad_norm": 1.5144602060317993, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8890742063522339, + "num_tokens": 722552213.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "grad_norm": 1.645658254623413, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8949050903320312, + "num_tokens": 722585515.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "grad_norm": 1.4798589944839478, + "learning_rate": 1e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9028750061988831, + "num_tokens": 722626124.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "grad_norm": 1.3210506439208984, + "learning_rate": 1e-06, + "loss": 0.2886, + 
"mean_token_accuracy": 0.8943960666656494, + "num_tokens": 722671393.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "grad_norm": 1.7562837600708008, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8927359580993652, + "num_tokens": 722702005.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "grad_norm": 1.563554286956787, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.875710129737854, + "num_tokens": 722742330.0, + "step": 18944 + }, + { + "epoch": 2.409998727897214, + "grad_norm": 1.5496727228164673, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.899172842502594, + "num_tokens": 722776183.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "grad_norm": 1.4215494394302368, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8886749744415283, + "num_tokens": 722819749.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "grad_norm": 1.5170906782150269, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8728055953979492, + "num_tokens": 722865164.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "grad_norm": 1.42215895652771, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8902701139450073, + "num_tokens": 722910496.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "grad_norm": 1.6678791046142578, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8823330402374268, + "num_tokens": 722948021.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "grad_norm": 1.5843061208724976, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.889991283416748, + "num_tokens": 722982689.0, + "step": 18950 + }, + { + "epoch": 2.410761989568757, + "grad_norm": 1.6084553003311157, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8739073276519775, + "num_tokens": 723020906.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, 
+ "grad_norm": 1.7926232814788818, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8876725435256958, + "num_tokens": 723048681.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "grad_norm": 1.6494237184524536, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.864648699760437, + "num_tokens": 723086956.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "grad_norm": 1.6062802076339722, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8826994299888611, + "num_tokens": 723121519.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "grad_norm": 1.4911751747131348, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8800119161605835, + "num_tokens": 723163698.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "grad_norm": 1.7461662292480469, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8682383894920349, + "num_tokens": 723197692.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "grad_norm": 1.6728895902633667, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8916631937026978, + "num_tokens": 723230174.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "grad_norm": 1.5134423971176147, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8843573927879333, + "num_tokens": 723269644.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "grad_norm": 1.455596923828125, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8900879621505737, + "num_tokens": 723310687.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "grad_norm": 1.507215976715088, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8833402395248413, + "num_tokens": 723351758.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "grad_norm": 1.5381784439086914, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8867216110229492, + 
"num_tokens": 723390723.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "grad_norm": 1.6298902034759521, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8729424476623535, + "num_tokens": 723430479.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "grad_norm": 1.6768991947174072, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8904426097869873, + "num_tokens": 723465510.0, + "step": 18963 + }, + { + "epoch": 2.412415723190434, + "grad_norm": 1.6870957612991333, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.871422290802002, + "num_tokens": 723504819.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "grad_norm": 1.699549674987793, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8745367527008057, + "num_tokens": 723543326.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "grad_norm": 1.5592923164367676, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8823376893997192, + "num_tokens": 723581400.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "grad_norm": 1.5641340017318726, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8775927424430847, + "num_tokens": 723618938.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "grad_norm": 1.5438532829284668, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8992609977722168, + "num_tokens": 723654698.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "grad_norm": 1.630078673362732, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8890630006790161, + "num_tokens": 723691809.0, + "step": 18969 + }, + { + "epoch": 2.4131789848619767, + "grad_norm": 1.634781837463379, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8870395421981812, + "num_tokens": 723727454.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "grad_norm": 1.5118441581726074, + 
"learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8849676251411438, + "num_tokens": 723766837.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "grad_norm": 1.5130189657211304, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8899989724159241, + "num_tokens": 723804298.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "grad_norm": 1.3967944383621216, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8885830044746399, + "num_tokens": 723848859.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "grad_norm": 1.7462358474731445, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8959985971450806, + "num_tokens": 723878953.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "grad_norm": 1.576897382736206, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.87947678565979, + "num_tokens": 723918865.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "grad_norm": 1.3137603998184204, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8986446857452393, + "num_tokens": 723965067.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "grad_norm": 1.5507843494415283, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8889586925506592, + "num_tokens": 724003719.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "grad_norm": 1.7210816144943237, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8908566236495972, + "num_tokens": 724036026.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "grad_norm": 1.548362374305725, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8952358961105347, + "num_tokens": 724071632.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "grad_norm": 1.4219183921813965, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8778570294380188, + "num_tokens": 724115535.0, + "step": 
18980 + }, + { + "epoch": 2.4145782979264725, + "grad_norm": 1.4877980947494507, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8975293636322021, + "num_tokens": 724154515.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "grad_norm": 1.6225817203521729, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8908258676528931, + "num_tokens": 724192612.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + "grad_norm": 1.7296022176742554, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.88457852602005, + "num_tokens": 724227469.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "grad_norm": 1.537401795387268, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8866794109344482, + "num_tokens": 724265662.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "grad_norm": 1.501332402229309, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8841185569763184, + "num_tokens": 724308461.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "grad_norm": 1.6103824377059937, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8711360096931458, + "num_tokens": 724346740.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "grad_norm": 1.754112958908081, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.885584831237793, + "num_tokens": 724377967.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "grad_norm": 1.601709246635437, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8858036994934082, + "num_tokens": 724416703.0, + "step": 18988 + }, + { + "epoch": 2.4155959801551967, + "grad_norm": 1.563997507095337, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8896861672401428, + "num_tokens": 724453570.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "grad_norm": 1.6002306938171387, + "learning_rate": 1e-06, + "loss": 0.2739, + 
"mean_token_accuracy": 0.9011656641960144, + "num_tokens": 724488428.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "grad_norm": 1.4600698947906494, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8850667476654053, + "num_tokens": 724533195.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "grad_norm": 1.4430162906646729, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8955752849578857, + "num_tokens": 724573994.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "grad_norm": 1.4720458984375, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8836503028869629, + "num_tokens": 724616037.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "grad_norm": 1.6275478601455688, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8920707702636719, + "num_tokens": 724651481.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "grad_norm": 1.5080245733261108, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.879296064376831, + "num_tokens": 724693819.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "grad_norm": 1.512619137763977, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8922269344329834, + "num_tokens": 724734552.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "grad_norm": 1.525694727897644, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8740672469139099, + "num_tokens": 724776635.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "grad_norm": 1.5656030178070068, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8918029069900513, + "num_tokens": 724813607.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "grad_norm": 1.4173270463943481, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8949558138847351, + "num_tokens": 724855836.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + 
"grad_norm": 1.474104404449463, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8922969102859497, + "num_tokens": 724895705.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "grad_norm": 1.5876779556274414, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8872548341751099, + "num_tokens": 724932879.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "grad_norm": 1.7257099151611328, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8795034885406494, + "num_tokens": 724967124.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "grad_norm": 1.5733952522277832, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8785877823829651, + "num_tokens": 725009074.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "grad_norm": 1.6169800758361816, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8834621906280518, + "num_tokens": 725047452.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "grad_norm": 1.484490990638733, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.886982262134552, + "num_tokens": 725087954.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "grad_norm": 1.6319682598114014, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8934564590454102, + "num_tokens": 725121826.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "grad_norm": 1.7160747051239014, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8795429468154907, + "num_tokens": 725158748.0, + "step": 19007 + }, + { + "epoch": 2.4180129754484163, + "grad_norm": 1.5529980659484863, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8943195343017578, + "num_tokens": 725195691.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "grad_norm": 1.5236530303955078, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8864433169364929, + 
"num_tokens": 725234535.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "grad_norm": 1.5746866464614868, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8710256218910217, + "num_tokens": 725275843.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "grad_norm": 1.5817989110946655, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8829605579376221, + "num_tokens": 725315443.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "grad_norm": 1.5726178884506226, + "learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9029129147529602, + "num_tokens": 725349465.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "grad_norm": 1.6416460275650024, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8752347230911255, + "num_tokens": 725389349.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "grad_norm": 1.5765197277069092, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8740341663360596, + "num_tokens": 725428531.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "grad_norm": 1.4325693845748901, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8901574611663818, + "num_tokens": 725473924.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "grad_norm": 1.5604848861694336, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8914943933486938, + "num_tokens": 725512629.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "grad_norm": 1.7668594121932983, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8754020929336548, + "num_tokens": 725545905.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "grad_norm": 1.546627163887024, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8778946399688721, + "num_tokens": 725589232.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "grad_norm": 1.6703917980194092, + 
"learning_rate": 1e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.8975757360458374, + "num_tokens": 725620738.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "grad_norm": 1.5834978818893433, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8874598741531372, + "num_tokens": 725658114.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "grad_norm": 1.5813100337982178, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.889502227306366, + "num_tokens": 725696486.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "grad_norm": 1.6489267349243164, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8702199459075928, + "num_tokens": 725734042.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "grad_norm": 1.6500308513641357, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8831305503845215, + "num_tokens": 725771632.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "grad_norm": 1.5780515670776367, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8931623697280884, + "num_tokens": 725803464.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "grad_norm": 1.560023546218872, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8851562142372131, + "num_tokens": 725839160.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "grad_norm": 1.491989016532898, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8904852867126465, + "num_tokens": 725883222.0, + "step": 19026 + }, + { + "epoch": 2.420429970741636, + "grad_norm": 1.5925229787826538, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8834847807884216, + "num_tokens": 725922785.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "grad_norm": 1.6531578302383423, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8818769454956055, + "num_tokens": 725963944.0, + "step": 
19028 + }, + { + "epoch": 2.420684391298817, + "grad_norm": 1.5431879758834839, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8895223736763, + "num_tokens": 726001537.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "grad_norm": 1.6599117517471313, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8770266771316528, + "num_tokens": 726039806.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "grad_norm": 1.5150409936904907, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8766683340072632, + "num_tokens": 726081797.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "grad_norm": 1.4355289936065674, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.896611213684082, + "num_tokens": 726124143.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "grad_norm": 1.4838556051254272, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8883201479911804, + "num_tokens": 726165661.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "grad_norm": 1.6995437145233154, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8885248899459839, + "num_tokens": 726198002.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "grad_norm": 1.509855031967163, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8849219083786011, + "num_tokens": 726239949.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "grad_norm": 1.4915217161178589, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8822475671768188, + "num_tokens": 726278804.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "grad_norm": 1.5694212913513184, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8759315013885498, + "num_tokens": 726315753.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "grad_norm": 1.6361335515975952, + "learning_rate": 1e-06, + "loss": 0.3508, + 
"mean_token_accuracy": 0.8772956132888794, + "num_tokens": 726355776.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "grad_norm": 1.5977013111114502, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8659230470657349, + "num_tokens": 726395788.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "grad_norm": 1.596207857131958, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8828352689743042, + "num_tokens": 726433120.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "grad_norm": 1.662394404411316, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8797235488891602, + "num_tokens": 726468308.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "grad_norm": 1.7039438486099243, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8832213878631592, + "num_tokens": 726498825.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "grad_norm": 1.447506070137024, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8900155425071716, + "num_tokens": 726540926.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "grad_norm": 1.4571928977966309, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8963838815689087, + "num_tokens": 726579709.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "grad_norm": 1.7180598974227905, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8884274959564209, + "num_tokens": 726614343.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "grad_norm": 1.708511471748352, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8706634640693665, + "num_tokens": 726649987.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "grad_norm": 1.4840776920318604, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8921753168106079, + "num_tokens": 726689976.0, + "step": 19047 + }, + { + "epoch": 
2.4231013865920366, + "grad_norm": 1.4565982818603516, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8776367902755737, + "num_tokens": 726733853.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "grad_norm": 1.470251441001892, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8824071884155273, + "num_tokens": 726777600.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "grad_norm": 1.5112642049789429, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.892765462398529, + "num_tokens": 726814484.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "grad_norm": 1.5300788879394531, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8747539520263672, + "num_tokens": 726855942.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "grad_norm": 1.5494858026504517, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8887308239936829, + "num_tokens": 726892273.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "grad_norm": 1.5800336599349976, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8859155178070068, + "num_tokens": 726931323.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "grad_norm": 1.7825700044631958, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8752211928367615, + "num_tokens": 726965909.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "grad_norm": 1.6076289415359497, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8909187912940979, + "num_tokens": 727002986.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "grad_norm": 1.5019559860229492, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8932089805603027, + "num_tokens": 727042241.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "grad_norm": 1.6857160329818726, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 
0.8685466051101685, + "num_tokens": 727079045.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "grad_norm": 1.7705323696136475, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8703243732452393, + "num_tokens": 727114884.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "grad_norm": 1.540636658668518, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8912532925605774, + "num_tokens": 727154822.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "grad_norm": 1.5472062826156616, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8917908668518066, + "num_tokens": 727191764.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "grad_norm": 1.3969348669052124, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.8973497152328491, + "num_tokens": 727234875.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "grad_norm": 1.476048231124878, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8927145004272461, + "num_tokens": 727277165.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "grad_norm": 1.3997756242752075, + "learning_rate": 1e-06, + "loss": 0.2736, + "mean_token_accuracy": 0.9005946516990662, + "num_tokens": 727320844.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "grad_norm": 1.529572606086731, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8921627998352051, + "num_tokens": 727359194.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "grad_norm": 1.3968620300292969, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8764151334762573, + "num_tokens": 727406747.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "grad_norm": 1.5734935998916626, + "learning_rate": 1e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9076723456382751, + "num_tokens": 727438195.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "grad_norm": 
1.5323258638381958, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8871639966964722, + "num_tokens": 727477491.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "grad_norm": 1.4210944175720215, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.8991474509239197, + "num_tokens": 727514837.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "grad_norm": 1.5413646697998047, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8781816959381104, + "num_tokens": 727554903.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "grad_norm": 1.5492278337478638, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8885297775268555, + "num_tokens": 727593421.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "grad_norm": 1.7720149755477905, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8769323825836182, + "num_tokens": 727628861.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "grad_norm": 1.688794732093811, + "learning_rate": 1e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8938111066818237, + "num_tokens": 727662993.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "grad_norm": 1.622964859008789, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8858780860900879, + "num_tokens": 727698908.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "grad_norm": 1.487348198890686, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8883830308914185, + "num_tokens": 727739614.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "grad_norm": 1.4910601377487183, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8841655850410461, + "num_tokens": 727782245.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "grad_norm": 1.6090277433395386, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8854355812072754, + "num_tokens": 
727817652.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "grad_norm": 1.6609002351760864, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8908863067626953, + "num_tokens": 727848396.0, + "step": 19077 + }, + { + "epoch": 2.426917694949752, + "grad_norm": 1.462577223777771, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8867843747138977, + "num_tokens": 727887776.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "grad_norm": 1.6243504285812378, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8860166072845459, + "num_tokens": 727922775.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "grad_norm": 1.5142320394515991, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8759191632270813, + "num_tokens": 727971871.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "grad_norm": 1.582501769065857, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8766607046127319, + "num_tokens": 728011332.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "grad_norm": 1.5316476821899414, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8925775289535522, + "num_tokens": 728048864.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "grad_norm": 1.6612619161605835, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8673547506332397, + "num_tokens": 728088044.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "grad_norm": 1.6246190071105957, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8743910193443298, + "num_tokens": 728124596.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "grad_norm": 1.3805162906646729, + "learning_rate": 1e-06, + "loss": 0.2675, + "mean_token_accuracy": 0.9048518538475037, + "num_tokens": 728170324.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "grad_norm": 1.7627862691879272, + "learning_rate": 
1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8712437748908997, + "num_tokens": 728204090.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "grad_norm": 1.5074018239974976, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8789684772491455, + "num_tokens": 728244274.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "grad_norm": 1.6732237339019775, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8894075155258179, + "num_tokens": 728276759.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "grad_norm": 1.6310834884643555, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8677557110786438, + "num_tokens": 728313855.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "grad_norm": 1.623020052909851, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8941267728805542, + "num_tokens": 728347724.0, + "step": 19090 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 1.371019959449768, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8895466923713684, + "num_tokens": 728390153.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "grad_norm": 1.5466032028198242, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8761396408081055, + "num_tokens": 728431218.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "grad_norm": 1.5019309520721436, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8909475803375244, + "num_tokens": 728469041.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "grad_norm": 1.5840590000152588, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8937379121780396, + "num_tokens": 728503246.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "grad_norm": 1.5983452796936035, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8880411386489868, + "num_tokens": 728535918.0, + "step": 19095 + }, + { + 
"epoch": 2.429207479964381, + "grad_norm": 1.5238747596740723, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8954702615737915, + "num_tokens": 728573612.0, + "step": 19096 + }, + { + "epoch": 2.4293346902429716, + "grad_norm": 1.5944191217422485, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8929224014282227, + "num_tokens": 728611249.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "grad_norm": 1.610892653465271, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8787785768508911, + "num_tokens": 728650743.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "grad_norm": 1.611971378326416, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8839078545570374, + "num_tokens": 728687238.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "grad_norm": 1.6565322875976562, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8754642605781555, + "num_tokens": 728725915.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "grad_norm": 1.565111756324768, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8913898468017578, + "num_tokens": 728766980.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "grad_norm": 1.6943089962005615, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8845493793487549, + "num_tokens": 728804559.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "grad_norm": 1.4537241458892822, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8890314698219299, + "num_tokens": 728847256.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "grad_norm": 1.5680646896362305, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8877993822097778, + "num_tokens": 728885657.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "grad_norm": 1.4415216445922852, + "learning_rate": 1e-06, + "loss": 0.3227, + 
"mean_token_accuracy": 0.8850822448730469, + "num_tokens": 728928820.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "grad_norm": 1.5752646923065186, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.890806257724762, + "num_tokens": 728964415.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "grad_norm": 1.474792242050171, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8895152807235718, + "num_tokens": 729005046.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "grad_norm": 1.6653187274932861, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8812509775161743, + "num_tokens": 729041007.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "grad_norm": 1.6938608884811401, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8778505325317383, + "num_tokens": 729075897.0, + "step": 19109 + }, + { + "epoch": 2.4309884238646484, + "grad_norm": 1.5168713331222534, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8859684467315674, + "num_tokens": 729120270.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "grad_norm": 1.710673451423645, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8756666779518127, + "num_tokens": 729155017.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "grad_norm": 1.5043383836746216, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.8985267877578735, + "num_tokens": 729190423.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "grad_norm": 1.5103033781051636, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8929053544998169, + "num_tokens": 729228743.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "grad_norm": 1.6987597942352295, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8716418743133545, + "num_tokens": 729271742.0, + "step": 19114 + }, + { + "epoch": 
2.4316244752576006, + "grad_norm": 1.5436054468154907, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8727911114692688, + "num_tokens": 729314878.0, + "step": 19115 + }, + { + "epoch": 2.431751685536191, + "grad_norm": 1.70555579662323, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8658303618431091, + "num_tokens": 729348090.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "grad_norm": 1.4910680055618286, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.88292396068573, + "num_tokens": 729388343.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "grad_norm": 1.5452700853347778, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8893392086029053, + "num_tokens": 729425782.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "grad_norm": 1.5088672637939453, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8936644792556763, + "num_tokens": 729465840.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "grad_norm": 1.426012396812439, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8913900852203369, + "num_tokens": 729510598.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "grad_norm": 1.559269905090332, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8750190138816833, + "num_tokens": 729548296.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "grad_norm": 1.5091713666915894, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8785805702209473, + "num_tokens": 729587749.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "grad_norm": 1.5990848541259766, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8828980922698975, + "num_tokens": 729625672.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "grad_norm": 1.5140955448150635, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 
0.8897562623023987, + "num_tokens": 729663904.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "grad_norm": 1.544736385345459, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8896763324737549, + "num_tokens": 729702415.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "grad_norm": 1.5254530906677246, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8855109810829163, + "num_tokens": 729740514.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "grad_norm": 1.6546529531478882, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8825933933258057, + "num_tokens": 729775835.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "grad_norm": 1.7397544384002686, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8781013488769531, + "num_tokens": 729811275.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + "grad_norm": 1.6686738729476929, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.897777795791626, + "num_tokens": 729841947.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "grad_norm": 1.6255732774734497, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8752037882804871, + "num_tokens": 729881190.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "grad_norm": 1.5753110647201538, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8822098970413208, + "num_tokens": 729924517.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "grad_norm": 1.7124429941177368, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8938156962394714, + "num_tokens": 729956905.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "grad_norm": 1.4954301118850708, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.875558078289032, + "num_tokens": 729996897.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "grad_norm": 
1.4707016944885254, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8926529884338379, + "num_tokens": 730037844.0, + "step": 19134 + }, + { + "epoch": 2.434168680829411, + "grad_norm": 1.5892900228500366, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8775702714920044, + "num_tokens": 730077815.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "grad_norm": 1.7794564962387085, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8841556310653687, + "num_tokens": 730116907.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "grad_norm": 1.639411211013794, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8923687934875488, + "num_tokens": 730153531.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "grad_norm": 1.7138478755950928, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8853127956390381, + "num_tokens": 730186868.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "grad_norm": 1.5504971742630005, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8823280334472656, + "num_tokens": 730224718.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "grad_norm": 1.531414270401001, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8872995972633362, + "num_tokens": 730266669.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "grad_norm": 1.6656444072723389, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8821383118629456, + "num_tokens": 730300665.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "grad_norm": 1.4867080450057983, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8919659852981567, + "num_tokens": 730339243.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "grad_norm": 1.4689081907272339, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8819597959518433, + 
"num_tokens": 730379996.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "grad_norm": 1.4514427185058594, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8973017930984497, + "num_tokens": 730417287.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "grad_norm": 1.5529143810272217, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8794693350791931, + "num_tokens": 730457283.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "grad_norm": 1.4756674766540527, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8888970613479614, + "num_tokens": 730497816.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "grad_norm": 1.5623250007629395, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.88564133644104, + "num_tokens": 730534458.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "grad_norm": 1.620191216468811, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8929619789123535, + "num_tokens": 730570796.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "grad_norm": 1.5735774040222168, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8881624937057495, + "num_tokens": 730608151.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "grad_norm": 1.4210834503173828, + "learning_rate": 1e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9057762622833252, + "num_tokens": 730650567.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "grad_norm": 1.5787522792816162, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8896178603172302, + "num_tokens": 730692407.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "grad_norm": 1.427614688873291, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8959379196166992, + "num_tokens": 730732873.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "grad_norm": 1.5693808794021606, + 
"learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8895547389984131, + "num_tokens": 730771837.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "grad_norm": 1.587988018989563, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8834460973739624, + "num_tokens": 730810893.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "grad_norm": 1.591896414756775, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8835148811340332, + "num_tokens": 730852335.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "grad_norm": 1.6126426458358765, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8809893131256104, + "num_tokens": 730889306.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "grad_norm": 1.5188524723052979, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.90106600522995, + "num_tokens": 730927561.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "grad_norm": 1.4666804075241089, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8955202102661133, + "num_tokens": 730968270.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "grad_norm": 1.4172885417938232, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8862264156341553, + "num_tokens": 731011130.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "grad_norm": 1.4736789464950562, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8968786001205444, + "num_tokens": 731051590.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "grad_norm": 1.6232222318649292, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8853209018707275, + "num_tokens": 731090610.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "grad_norm": 1.4645557403564453, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8907178640365601, + "num_tokens": 731129374.0, + "step": 
19162 + }, + { + "epoch": 2.437730568629945, + "grad_norm": 1.648635745048523, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8719074130058289, + "num_tokens": 731168417.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "grad_norm": 1.5660732984542847, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8910316228866577, + "num_tokens": 731206318.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "grad_norm": 1.5806076526641846, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8860757350921631, + "num_tokens": 731246490.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "grad_norm": 1.5508743524551392, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8857936859130859, + "num_tokens": 731289242.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "grad_norm": 1.4822297096252441, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8899320363998413, + "num_tokens": 731329918.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "grad_norm": 1.6156400442123413, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8689380288124084, + "num_tokens": 731368985.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "grad_norm": 1.5746289491653442, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8836885690689087, + "num_tokens": 731405801.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "grad_norm": 1.408609390258789, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8820424675941467, + "num_tokens": 731450265.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "grad_norm": 1.6234381198883057, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8756881952285767, + "num_tokens": 731486732.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "grad_norm": 1.496829867362976, + "learning_rate": 1e-06, + "loss": 0.3262, + 
"mean_token_accuracy": 0.8823673725128174, + "num_tokens": 731526590.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "grad_norm": 1.7730591297149658, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.886742889881134, + "num_tokens": 731558290.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "grad_norm": 1.5976122617721558, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.890289843082428, + "num_tokens": 731593699.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "grad_norm": 1.608917236328125, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8937473893165588, + "num_tokens": 731630118.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "grad_norm": 1.5068230628967285, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8832281231880188, + "num_tokens": 731667680.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "grad_norm": 1.4082964658737183, + "learning_rate": 1e-06, + "loss": 0.2536, + "mean_token_accuracy": 0.9067387580871582, + "num_tokens": 731702253.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "grad_norm": 1.600938320159912, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8869812488555908, + "num_tokens": 731742358.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "grad_norm": 1.5217336416244507, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8807857036590576, + "num_tokens": 731781347.0, + "step": 19179 + }, + { + "epoch": 2.439893143365984, + "grad_norm": 1.5859237909317017, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8892109394073486, + "num_tokens": 731818439.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "grad_norm": 1.567728877067566, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8913288116455078, + "num_tokens": 731853407.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + 
"grad_norm": 1.6026527881622314, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8963192105293274, + "num_tokens": 731887795.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "grad_norm": 1.5809651613235474, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.892878532409668, + "num_tokens": 731924458.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "grad_norm": 1.629202961921692, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8723883032798767, + "num_tokens": 731959722.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "grad_norm": 1.5044134855270386, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8859097957611084, + "num_tokens": 731999516.0, + "step": 19185 + }, + { + "epoch": 2.440656405037527, + "grad_norm": 1.432765245437622, + "learning_rate": 1e-06, + "loss": 0.2636, + "mean_token_accuracy": 0.9040917754173279, + "num_tokens": 732039557.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "grad_norm": 1.579120397567749, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.884192705154419, + "num_tokens": 732076828.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "grad_norm": 1.732764482498169, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8621969223022461, + "num_tokens": 732113825.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "grad_norm": 1.6259156465530396, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8981706500053406, + "num_tokens": 732144850.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "grad_norm": 1.761884331703186, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8863050937652588, + "num_tokens": 732175984.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "grad_norm": 1.5672192573547363, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.892589807510376, + 
"num_tokens": 732210241.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "grad_norm": 1.5844556093215942, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8804200887680054, + "num_tokens": 732245711.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "grad_norm": 1.5906238555908203, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8868252038955688, + "num_tokens": 732280856.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "grad_norm": 1.534192681312561, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8820419311523438, + "num_tokens": 732323481.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "grad_norm": 1.4155631065368652, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8854215741157532, + "num_tokens": 732370739.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "grad_norm": 1.5487712621688843, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8854621648788452, + "num_tokens": 732413213.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "grad_norm": 1.48140287399292, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8909807205200195, + "num_tokens": 732448969.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "grad_norm": 1.4501327276229858, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8864333033561707, + "num_tokens": 732488831.0, + "step": 19198 + }, + { + "epoch": 2.4423101386592037, + "grad_norm": 1.6287615299224854, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8905363082885742, + "num_tokens": 732521764.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "grad_norm": 1.6992104053497314, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8921761512756348, + "num_tokens": 732555107.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "grad_norm": 1.6192485094070435, + 
"learning_rate": 1e-06, + "loss": 0.2777, + "mean_token_accuracy": 0.8976620435714722, + "num_tokens": 732586409.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "grad_norm": 1.596823811531067, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8894546031951904, + "num_tokens": 732624624.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "grad_norm": 1.563504695892334, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8889852166175842, + "num_tokens": 732663107.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "grad_norm": 1.5747286081314087, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8922469615936279, + "num_tokens": 732700845.0, + "step": 19204 + }, + { + "epoch": 2.443073400330747, + "grad_norm": 1.7056775093078613, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8862549066543579, + "num_tokens": 732732849.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "grad_norm": 1.6942775249481201, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8770435452461243, + "num_tokens": 732772248.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "grad_norm": 1.4921458959579468, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8858764171600342, + "num_tokens": 732815132.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "grad_norm": 1.6254690885543823, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8831984400749207, + "num_tokens": 732852456.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "grad_norm": 1.5462546348571777, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8869824409484863, + "num_tokens": 732892664.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "grad_norm": 1.7039859294891357, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8761506676673889, + "num_tokens": 732929439.0, + 
"step": 19210 + }, + { + "epoch": 2.4438366620022896, + "grad_norm": 1.6049525737762451, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.878981351852417, + "num_tokens": 732971339.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "grad_norm": 1.6367104053497314, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.877677321434021, + "num_tokens": 733011698.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "grad_norm": 1.5094801187515259, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8890285491943359, + "num_tokens": 733051155.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "grad_norm": 1.7184406518936157, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.890494704246521, + "num_tokens": 733085455.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "grad_norm": 1.6903905868530273, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8924196362495422, + "num_tokens": 733116326.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "grad_norm": 1.4089621305465698, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8955447673797607, + "num_tokens": 733158152.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "grad_norm": 1.706528902053833, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8876391649246216, + "num_tokens": 733193957.0, + "step": 19217 + }, + { + "epoch": 2.4447271339524232, + "grad_norm": 1.6778255701065063, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8893794417381287, + "num_tokens": 733227878.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "grad_norm": 1.750332236289978, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.888993501663208, + "num_tokens": 733258731.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "grad_norm": 1.5254310369491577, + "learning_rate": 1e-06, + "loss": 
0.3525, + "mean_token_accuracy": 0.8761931657791138, + "num_tokens": 733300348.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "grad_norm": 1.5296307802200317, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.881338894367218, + "num_tokens": 733341273.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "grad_norm": 1.68165922164917, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8752191066741943, + "num_tokens": 733380596.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "grad_norm": 1.5369006395339966, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.889790415763855, + "num_tokens": 733420280.0, + "step": 19223 + }, + { + "epoch": 2.4454903956239664, + "grad_norm": 1.5347267389297485, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8815773725509644, + "num_tokens": 733460215.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "grad_norm": 1.4670324325561523, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8862075209617615, + "num_tokens": 733500819.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "grad_norm": 1.363197684288025, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.883495032787323, + "num_tokens": 733545776.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "grad_norm": 1.6871730089187622, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8869744539260864, + "num_tokens": 733579278.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "grad_norm": 1.5622318983078003, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8821768760681152, + "num_tokens": 733617193.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "grad_norm": 1.6320167779922485, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8977784514427185, + "num_tokens": 733650648.0, + "step": 19229 + }, + { + "epoch": 
2.4462536572955096, + "grad_norm": 1.51898992061615, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8909593820571899, + "num_tokens": 733688100.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "grad_norm": 1.6858261823654175, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.886809229850769, + "num_tokens": 733719642.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "grad_norm": 1.7625058889389038, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8957352042198181, + "num_tokens": 733746146.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "grad_norm": 1.6852020025253296, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8887212872505188, + "num_tokens": 733779116.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "grad_norm": 1.5496668815612793, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.893488883972168, + "num_tokens": 733816628.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "grad_norm": 1.7320646047592163, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8800729513168335, + "num_tokens": 733851328.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "grad_norm": 1.666056513786316, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8908599019050598, + "num_tokens": 733886218.0, + "step": 19236 + }, + { + "epoch": 2.4471441292456433, + "grad_norm": 1.482934594154358, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8984764814376831, + "num_tokens": 733925526.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "grad_norm": 1.6848877668380737, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8745301365852356, + "num_tokens": 733962307.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "grad_norm": 1.5613986253738403, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 
0.8857640027999878, + "num_tokens": 734003669.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "grad_norm": 1.7096880674362183, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8878676295280457, + "num_tokens": 734035501.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "grad_norm": 1.622487187385559, + "learning_rate": 1e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9058830738067627, + "num_tokens": 734066793.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "grad_norm": 1.5925699472427368, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8764733672142029, + "num_tokens": 734108877.0, + "step": 19242 + }, + { + "epoch": 2.447907390917186, + "grad_norm": 1.499902606010437, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8972499370574951, + "num_tokens": 734145260.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "grad_norm": 1.496556282043457, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8860871195793152, + "num_tokens": 734185915.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "grad_norm": 1.5892446041107178, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8913158774375916, + "num_tokens": 734222425.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "grad_norm": 1.6353254318237305, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8708412051200867, + "num_tokens": 734258146.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "grad_norm": 1.6026532649993896, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8689230680465698, + "num_tokens": 734295914.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "grad_norm": 1.5949459075927734, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8885615468025208, + "num_tokens": 734333828.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "grad_norm": 
1.6435906887054443, + "learning_rate": 1e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.8975962400436401, + "num_tokens": 734367216.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "grad_norm": 1.476218819618225, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8932638168334961, + "num_tokens": 734407662.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "grad_norm": 1.5240328311920166, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8931354284286499, + "num_tokens": 734446191.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "grad_norm": 1.5216325521469116, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8910393118858337, + "num_tokens": 734484473.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "grad_norm": 1.6211360692977905, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8677551746368408, + "num_tokens": 734527127.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "grad_norm": 1.5104655027389526, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8832000494003296, + "num_tokens": 734568786.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "grad_norm": 1.4479700326919556, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.886149525642395, + "num_tokens": 734610718.0, + "step": 19255 + }, + { + "epoch": 2.449561124538863, + "grad_norm": 1.5370134115219116, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8924057483673096, + "num_tokens": 734649148.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "grad_norm": 1.4463237524032593, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8973767757415771, + "num_tokens": 734691493.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "grad_norm": 1.5068128108978271, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8876321315765381, + "num_tokens": 
734728343.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "grad_norm": 1.6590272188186646, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8821613788604736, + "num_tokens": 734763955.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "grad_norm": 1.6941049098968506, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8754210472106934, + "num_tokens": 734799629.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "grad_norm": 1.5316544771194458, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8884884715080261, + "num_tokens": 734839474.0, + "step": 19261 + }, + { + "epoch": 2.450324386210406, + "grad_norm": 1.6531193256378174, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8848934173583984, + "num_tokens": 734873489.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "grad_norm": 1.573287844657898, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8776697516441345, + "num_tokens": 734915677.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "grad_norm": 1.6484652757644653, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8930030465126038, + "num_tokens": 734947672.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "grad_norm": 1.4749815464019775, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8925058841705322, + "num_tokens": 734988882.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "grad_norm": 1.5393784046173096, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.897768497467041, + "num_tokens": 735023952.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "grad_norm": 1.5651017427444458, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8817853927612305, + "num_tokens": 735059194.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "grad_norm": 1.5686944723129272, + "learning_rate": 
1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8882393836975098, + "num_tokens": 735096421.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "grad_norm": 1.450191855430603, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8834453225135803, + "num_tokens": 735140783.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "grad_norm": 1.7949259281158447, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8744630813598633, + "num_tokens": 735173678.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "grad_norm": 1.5292901992797852, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8912317156791687, + "num_tokens": 735208537.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "grad_norm": 1.5760215520858765, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8885658979415894, + "num_tokens": 735245129.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "grad_norm": 1.4547410011291504, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8879706263542175, + "num_tokens": 735287357.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "grad_norm": 1.500251293182373, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8921849131584167, + "num_tokens": 735327153.0, + "step": 19274 + }, + { + "epoch": 2.4519781198320825, + "grad_norm": 1.6343562602996826, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8716108798980713, + "num_tokens": 735368507.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "grad_norm": 1.5019131898880005, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8852982521057129, + "num_tokens": 735410720.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "grad_norm": 1.552965521812439, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8852304220199585, + "num_tokens": 735447689.0, + "step": 19277 + }, + { + 
"epoch": 2.452359750667854, + "grad_norm": 1.674669623374939, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8811718225479126, + "num_tokens": 735481161.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "grad_norm": 1.4392971992492676, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.883851170539856, + "num_tokens": 735525508.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "grad_norm": 1.6170406341552734, + "learning_rate": 1e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.9007501006126404, + "num_tokens": 735555391.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "grad_norm": 1.48063063621521, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8831144571304321, + "num_tokens": 735594295.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "grad_norm": 1.4452929496765137, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8979745507240295, + "num_tokens": 735632667.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "grad_norm": 1.6838098764419556, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.885475754737854, + "num_tokens": 735663307.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "grad_norm": 1.5503238439559937, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8801284432411194, + "num_tokens": 735703379.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "grad_norm": 1.4614763259887695, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.883464515209198, + "num_tokens": 735744223.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "grad_norm": 1.4774936437606812, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.8976518511772156, + "num_tokens": 735784472.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "grad_norm": 1.5477733612060547, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 
0.8697724938392639, + "num_tokens": 735825417.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "grad_norm": 1.5691983699798584, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8898414969444275, + "num_tokens": 735864687.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "grad_norm": 1.5022523403167725, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8774493932723999, + "num_tokens": 735904102.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "grad_norm": 1.478247880935669, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8934404253959656, + "num_tokens": 735945551.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "grad_norm": 1.6108746528625488, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.87727290391922, + "num_tokens": 735984124.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "grad_norm": 1.5285898447036743, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8906353712081909, + "num_tokens": 736023338.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "grad_norm": 1.6635364294052124, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8815444707870483, + "num_tokens": 736055689.0, + "step": 19293 + }, + { + "epoch": 2.454395115125302, + "grad_norm": 1.6410486698150635, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8830224275588989, + "num_tokens": 736090015.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "grad_norm": 1.5532523393630981, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8920993804931641, + "num_tokens": 736127428.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "grad_norm": 1.5907787084579468, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8666765093803406, + "num_tokens": 736168012.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "grad_norm": 
1.649025797843933, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8599390387535095, + "num_tokens": 736205387.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "grad_norm": 1.563175916671753, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.885270357131958, + "num_tokens": 736242177.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "grad_norm": 1.5325183868408203, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8946398496627808, + "num_tokens": 736278868.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "grad_norm": 1.5915101766586304, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8904256820678711, + "num_tokens": 736315004.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "grad_norm": 1.6815390586853027, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8878260850906372, + "num_tokens": 736356680.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "grad_norm": 1.7099244594573975, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8783196210861206, + "num_tokens": 736392924.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "grad_norm": 1.5575135946273804, + "learning_rate": 1e-06, + "loss": 0.2751, + "mean_token_accuracy": 0.8995380997657776, + "num_tokens": 736429818.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "grad_norm": 1.6142102479934692, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.868549108505249, + "num_tokens": 736469092.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "grad_norm": 1.6621803045272827, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.883069634437561, + "num_tokens": 736506542.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "grad_norm": 1.542328119277954, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8814281225204468, + "num_tokens": 
736546300.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "grad_norm": 1.5595241785049438, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8872309923171997, + "num_tokens": 736585292.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "grad_norm": 1.4617373943328857, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.886374831199646, + "num_tokens": 736626730.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "grad_norm": 1.5729637145996094, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8866536617279053, + "num_tokens": 736663794.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "grad_norm": 1.6595336198806763, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8804086446762085, + "num_tokens": 736698358.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "grad_norm": 1.700093388557434, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8728804588317871, + "num_tokens": 736735579.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "grad_norm": 1.6858322620391846, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8827569484710693, + "num_tokens": 736772856.0, + "step": 19312 + }, + { + "epoch": 2.4568121104185217, + "grad_norm": 1.6508532762527466, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.880616307258606, + "num_tokens": 736809794.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "grad_norm": 1.534282922744751, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8944147825241089, + "num_tokens": 736848502.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "grad_norm": 1.4450838565826416, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8894412517547607, + "num_tokens": 736890998.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "grad_norm": 1.4624601602554321, + "learning_rate": 
1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8986287117004395, + "num_tokens": 736930839.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "grad_norm": 1.5741409063339233, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.894230842590332, + "num_tokens": 736964721.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "grad_norm": 1.493382453918457, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.894698441028595, + "num_tokens": 737005907.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "grad_norm": 1.4387238025665283, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8944079875946045, + "num_tokens": 737047940.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "grad_norm": 1.446061611175537, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8903850317001343, + "num_tokens": 737088894.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "grad_norm": 1.4531644582748413, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8837677240371704, + "num_tokens": 737133128.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "grad_norm": 1.5892688035964966, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8856610655784607, + "num_tokens": 737164542.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "grad_norm": 1.6799436807632446, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8844379186630249, + "num_tokens": 737198804.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "grad_norm": 1.441024661064148, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8936349153518677, + "num_tokens": 737241330.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "grad_norm": 1.6836880445480347, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8777774572372437, + "num_tokens": 737279144.0, + "step": 19325 + }, + { + 
"epoch": 2.4584658440401985, + "grad_norm": 1.8059370517730713, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8700315952301025, + "num_tokens": 737312808.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "grad_norm": 1.5135023593902588, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8787814378738403, + "num_tokens": 737357034.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "grad_norm": 1.577409267425537, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8990093469619751, + "num_tokens": 737393966.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "grad_norm": 1.6252278089523315, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.885986328125, + "num_tokens": 737432711.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "grad_norm": 1.5739277601242065, + "learning_rate": 1e-06, + "loss": 0.2525, + "mean_token_accuracy": 0.9063390493392944, + "num_tokens": 737466688.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "grad_norm": 1.5755470991134644, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8900695443153381, + "num_tokens": 737500315.0, + "step": 19331 + }, + { + "epoch": 2.4592291057117417, + "grad_norm": 1.488872766494751, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9009169936180115, + "num_tokens": 737535226.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "grad_norm": 1.4814949035644531, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8884328603744507, + "num_tokens": 737574822.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "grad_norm": 1.5554758310317993, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8902864456176758, + "num_tokens": 737612286.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "grad_norm": 1.5660921335220337, + "learning_rate": 1e-06, + "loss": 0.3224, + 
"mean_token_accuracy": 0.8848912715911865, + "num_tokens": 737648674.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "grad_norm": 1.578850507736206, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.9021971225738525, + "num_tokens": 737684554.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "grad_norm": 1.580694317817688, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8866881132125854, + "num_tokens": 737721767.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "grad_norm": 1.6978577375411987, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8897167444229126, + "num_tokens": 737753908.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "grad_norm": 1.7326128482818604, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.883098840713501, + "num_tokens": 737786935.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "grad_norm": 1.5475233793258667, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8896197080612183, + "num_tokens": 737824855.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "grad_norm": 1.5956710577011108, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8889570236206055, + "num_tokens": 737859904.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "grad_norm": 1.496600866317749, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8802236318588257, + "num_tokens": 737898374.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "grad_norm": 1.592448353767395, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8903184533119202, + "num_tokens": 737934579.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "grad_norm": 1.5433433055877686, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8964636325836182, + "num_tokens": 737973360.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + 
"grad_norm": 1.5395734310150146, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8991000652313232, + "num_tokens": 738012479.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "grad_norm": 1.6071953773498535, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.878105640411377, + "num_tokens": 738050211.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "grad_norm": 1.5445234775543213, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8905067443847656, + "num_tokens": 738086304.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "grad_norm": 1.6466232538223267, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.889645516872406, + "num_tokens": 738121306.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "grad_norm": 1.514220118522644, + "learning_rate": 1e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9000546336174011, + "num_tokens": 738155592.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "grad_norm": 1.6342781782150269, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8750811815261841, + "num_tokens": 738193174.0, + "step": 19350 + }, + { + "epoch": 2.4616461010049613, + "grad_norm": 1.4089959859848022, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.891545295715332, + "num_tokens": 738235941.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "grad_norm": 1.4127709865570068, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.8987960815429688, + "num_tokens": 738275973.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "grad_norm": 1.5902281999588013, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8865607976913452, + "num_tokens": 738311624.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "grad_norm": 1.4850375652313232, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8854049444198608, + 
"num_tokens": 738353099.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "grad_norm": 1.5782644748687744, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8788397908210754, + "num_tokens": 738394600.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "grad_norm": 1.5704299211502075, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8800283074378967, + "num_tokens": 738432407.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "grad_norm": 1.574187994003296, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8858233690261841, + "num_tokens": 738469069.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "grad_norm": 1.5057902336120605, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8859326243400574, + "num_tokens": 738507515.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "grad_norm": 1.5275743007659912, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8899564743041992, + "num_tokens": 738543470.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "grad_norm": 1.5366662740707397, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8843923807144165, + "num_tokens": 738585511.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "grad_norm": 1.655612587928772, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8847318887710571, + "num_tokens": 738623286.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "grad_norm": 1.6840789318084717, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.876300573348999, + "num_tokens": 738657743.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "grad_norm": 1.626761794090271, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8781388998031616, + "num_tokens": 738692328.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "grad_norm": 1.435489296913147, + 
"learning_rate": 1e-06, + "loss": 0.2615, + "mean_token_accuracy": 0.9038676619529724, + "num_tokens": 738729408.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "grad_norm": 1.5495810508728027, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8802609443664551, + "num_tokens": 738768073.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "grad_norm": 1.7065564393997192, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8803138136863708, + "num_tokens": 738802029.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "grad_norm": 1.650753378868103, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8748475313186646, + "num_tokens": 738840250.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "grad_norm": 1.5175729990005493, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8820476531982422, + "num_tokens": 738882467.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "grad_norm": 1.5584609508514404, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.8995373249053955, + "num_tokens": 738918260.0, + "step": 19369 + }, + { + "epoch": 2.464063096298181, + "grad_norm": 1.7810022830963135, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8662202954292297, + "num_tokens": 738956359.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "grad_norm": 1.5036392211914062, + "learning_rate": 1e-06, + "loss": 0.2748, + "mean_token_accuracy": 0.8995327949523926, + "num_tokens": 738992135.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "grad_norm": 1.48664128780365, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8897400498390198, + "num_tokens": 739036015.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "grad_norm": 1.4384838342666626, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8936774730682373, + "num_tokens": 739078412.0, + "step": 
19373 + }, + { + "epoch": 2.464571937412543, + "grad_norm": 1.661906123161316, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8929758667945862, + "num_tokens": 739114100.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "grad_norm": 1.5658999681472778, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8832473158836365, + "num_tokens": 739152396.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "grad_norm": 1.5764594078063965, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8806389570236206, + "num_tokens": 739190457.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "grad_norm": 1.5393813848495483, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8855149745941162, + "num_tokens": 739229078.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "grad_norm": 1.5786606073379517, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8915548324584961, + "num_tokens": 739263245.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "grad_norm": 1.611104130744934, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8842617273330688, + "num_tokens": 739303771.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "grad_norm": 1.7370057106018066, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8829747438430786, + "num_tokens": 739335388.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "grad_norm": 1.5687838792800903, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8778335452079773, + "num_tokens": 739375890.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "grad_norm": 1.4583635330200195, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8898533582687378, + "num_tokens": 739416264.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "grad_norm": 1.6625440120697021, + "learning_rate": 1e-06, + "loss": 0.3609, 
+ "mean_token_accuracy": 0.8729334473609924, + "num_tokens": 739454600.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "grad_norm": 1.8095357418060303, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8721370697021484, + "num_tokens": 739487397.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "grad_norm": 1.5344384908676147, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8813034296035767, + "num_tokens": 739527434.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "grad_norm": 1.4378689527511597, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.9004051685333252, + "num_tokens": 739567090.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "grad_norm": 1.517571210861206, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8854368329048157, + "num_tokens": 739606096.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "grad_norm": 1.5272653102874756, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8910057544708252, + "num_tokens": 739641779.0, + "step": 19388 + }, + { + "epoch": 2.4664800915914005, + "grad_norm": 1.5894900560379028, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8751828670501709, + "num_tokens": 739681991.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "grad_norm": 1.5729293823242188, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8928282856941223, + "num_tokens": 739720756.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "grad_norm": 1.5467196702957153, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8848943710327148, + "num_tokens": 739761003.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "grad_norm": 1.5478687286376953, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8832714557647705, + "num_tokens": 739800953.0, + "step": 19392 + }, + { + "epoch": 
2.4669889327057626, + "grad_norm": 1.491475224494934, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8960676193237305, + "num_tokens": 739840357.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "grad_norm": 1.5784077644348145, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8937826752662659, + "num_tokens": 739874367.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "grad_norm": 1.8508565425872803, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.878515362739563, + "num_tokens": 739909281.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "grad_norm": 1.4341387748718262, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8835479617118835, + "num_tokens": 739952533.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "grad_norm": 1.6248255968093872, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8899250030517578, + "num_tokens": 739985854.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "grad_norm": 1.6241191625595093, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8682996034622192, + "num_tokens": 740023247.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "grad_norm": 1.4064007997512817, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8783319592475891, + "num_tokens": 740069153.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "grad_norm": 1.7446352243423462, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8928956985473633, + "num_tokens": 740100126.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "grad_norm": 1.5589431524276733, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8822879791259766, + "num_tokens": 740140534.0, + "step": 19401 + }, + { + "epoch": 2.4681338252130773, + "grad_norm": 1.5403518676757812, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 
0.892716646194458, + "num_tokens": 740177319.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "grad_norm": 1.4910335540771484, + "learning_rate": 1e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9035422801971436, + "num_tokens": 740213171.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "grad_norm": 1.5141823291778564, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8889615535736084, + "num_tokens": 740254051.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "grad_norm": 1.4654139280319214, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8959161043167114, + "num_tokens": 740294129.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "grad_norm": 1.4831920862197876, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8780930042266846, + "num_tokens": 740337172.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "grad_norm": 1.6611899137496948, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8814480304718018, + "num_tokens": 740373118.0, + "step": 19407 + }, + { + "epoch": 2.4688970868846205, + "grad_norm": 1.5789554119110107, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8666878342628479, + "num_tokens": 740416016.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "grad_norm": 1.6014306545257568, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8941649794578552, + "num_tokens": 740453068.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "grad_norm": 1.5430402755737305, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8882676362991333, + "num_tokens": 740493806.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "grad_norm": 1.6704812049865723, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8898859024047852, + "num_tokens": 740528380.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "grad_norm": 
1.6001505851745605, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8846104741096497, + "num_tokens": 740564677.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "grad_norm": 1.620572566986084, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8895304799079895, + "num_tokens": 740601303.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "grad_norm": 1.5480068922042847, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8759939670562744, + "num_tokens": 740642651.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "grad_norm": 1.4331036806106567, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8833672404289246, + "num_tokens": 740687731.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "grad_norm": 1.6059650182724, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.887762188911438, + "num_tokens": 740722310.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "grad_norm": 1.9061766862869263, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8789307475090027, + "num_tokens": 740752001.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "grad_norm": 1.637487530708313, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8849165439605713, + "num_tokens": 740783907.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "grad_norm": 1.4590333700180054, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8799495100975037, + "num_tokens": 740823625.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "grad_norm": 1.5010534524917603, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8915850520133972, + "num_tokens": 740863214.0, + "step": 19420 + }, + { + "epoch": 2.470550820506297, + "grad_norm": 1.754894733428955, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8707518577575684, + "num_tokens": 
740900079.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "grad_norm": 1.5240073204040527, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8762305974960327, + "num_tokens": 740939822.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "grad_norm": 1.5354394912719727, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.890507698059082, + "num_tokens": 740978726.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "grad_norm": 1.5562207698822021, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8942262530326843, + "num_tokens": 741015068.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "grad_norm": 1.636388897895813, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8879022598266602, + "num_tokens": 741050146.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "grad_norm": 1.4818053245544434, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8794729709625244, + "num_tokens": 741093327.0, + "step": 19426 + }, + { + "epoch": 2.47131408217784, + "grad_norm": 1.577121376991272, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8870167136192322, + "num_tokens": 741133170.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "grad_norm": 1.538923740386963, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8909894227981567, + "num_tokens": 741173110.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "grad_norm": 1.6217933893203735, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8781415224075317, + "num_tokens": 741213784.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "grad_norm": 1.5427247285842896, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8887755870819092, + "num_tokens": 741254836.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "grad_norm": 1.5789912939071655, + "learning_rate": 1e-06, 
+ "loss": 0.3136, + "mean_token_accuracy": 0.8837190866470337, + "num_tokens": 741293322.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "grad_norm": 1.7019004821777344, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8764299750328064, + "num_tokens": 741330185.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "grad_norm": 1.3985111713409424, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8888074159622192, + "num_tokens": 741374178.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "grad_norm": 1.4758800268173218, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8655101656913757, + "num_tokens": 741418379.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "grad_norm": 1.437025547027588, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8817234039306641, + "num_tokens": 741459886.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "grad_norm": 1.6506308317184448, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8853743076324463, + "num_tokens": 741496202.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "grad_norm": 1.5371153354644775, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8767930269241333, + "num_tokens": 741536819.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "grad_norm": 1.5128473043441772, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8812742233276367, + "num_tokens": 741578705.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "grad_norm": 1.4950315952301025, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8734647035598755, + "num_tokens": 741620140.0, + "step": 19439 + }, + { + "epoch": 2.4729678157995165, + "grad_norm": 1.4362469911575317, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8985507488250732, + "num_tokens": 741662544.0, + "step": 19440 + }, + { + 
"epoch": 2.473095026078107, + "grad_norm": 1.6260361671447754, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8792144060134888, + "num_tokens": 741704458.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "grad_norm": 1.6100798845291138, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8963602185249329, + "num_tokens": 741741510.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "grad_norm": 1.7093863487243652, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8888789415359497, + "num_tokens": 741772445.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "grad_norm": 1.5058608055114746, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8843072056770325, + "num_tokens": 741810521.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "grad_norm": 1.4268463850021362, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8985065817832947, + "num_tokens": 741851409.0, + "step": 19445 + }, + { + "epoch": 2.4737310774710597, + "grad_norm": 1.4554333686828613, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8884987235069275, + "num_tokens": 741894955.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "grad_norm": 1.4862691164016724, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8928846120834351, + "num_tokens": 741932622.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "grad_norm": 1.8197674751281738, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8957788944244385, + "num_tokens": 741962187.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "grad_norm": 1.5162051916122437, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8848772644996643, + "num_tokens": 742003073.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "grad_norm": 1.6510480642318726, + "learning_rate": 1e-06, + "loss": 0.3672, + 
"mean_token_accuracy": 0.8740656971931458, + "num_tokens": 742041898.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "grad_norm": 1.5134365558624268, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.896737277507782, + "num_tokens": 742081302.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "grad_norm": 1.636049509048462, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8905656337738037, + "num_tokens": 742114093.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "grad_norm": 1.7144490480422974, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8815761208534241, + "num_tokens": 742148100.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "grad_norm": 1.4566175937652588, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8987276554107666, + "num_tokens": 742186232.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "grad_norm": 1.7115968465805054, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8904631733894348, + "num_tokens": 742222923.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "grad_norm": 1.5034548044204712, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8909773826599121, + "num_tokens": 742262808.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "grad_norm": 1.684708595275879, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8961544036865234, + "num_tokens": 742297452.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "grad_norm": 1.655342698097229, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8789273500442505, + "num_tokens": 742336540.0, + "step": 19458 + }, + { + "epoch": 2.475384811092736, + "grad_norm": 1.5795687437057495, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.88908451795578, + "num_tokens": 742375090.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + 
"grad_norm": 1.4883167743682861, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8808314800262451, + "num_tokens": 742414591.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "grad_norm": 1.5989340543746948, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8803719282150269, + "num_tokens": 742456879.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "grad_norm": 1.5618698596954346, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8857029676437378, + "num_tokens": 742496143.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "grad_norm": 1.5195075273513794, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8922876119613647, + "num_tokens": 742538117.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "grad_norm": 1.5290580987930298, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8837510943412781, + "num_tokens": 742575107.0, + "step": 19464 + }, + { + "epoch": 2.4761480727642793, + "grad_norm": 1.436691164970398, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8924633264541626, + "num_tokens": 742617446.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "grad_norm": 1.529516339302063, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.890516996383667, + "num_tokens": 742657966.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "grad_norm": 1.4796063899993896, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8840463161468506, + "num_tokens": 742699393.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "grad_norm": 1.5444303750991821, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8912785053253174, + "num_tokens": 742738651.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "grad_norm": 1.5077248811721802, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8857520818710327, + 
"num_tokens": 742779345.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "grad_norm": 1.6516811847686768, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8841266632080078, + "num_tokens": 742816911.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "grad_norm": 1.7077395915985107, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8926917314529419, + "num_tokens": 742849639.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "grad_norm": 1.626954197883606, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8905184268951416, + "num_tokens": 742887229.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "grad_norm": 1.5622444152832031, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8902782201766968, + "num_tokens": 742924606.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "grad_norm": 1.598997712135315, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8650949597358704, + "num_tokens": 742966374.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "grad_norm": 1.477035403251648, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.8981127142906189, + "num_tokens": 743003760.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "grad_norm": 1.720000147819519, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8843256235122681, + "num_tokens": 743035220.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "grad_norm": 1.490749478340149, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8874834775924683, + "num_tokens": 743078607.0, + "step": 19477 + }, + { + "epoch": 2.477801806385956, + "grad_norm": 1.5052372217178345, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8793207406997681, + "num_tokens": 743121786.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "grad_norm": 1.7235170602798462, + 
"learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8713037967681885, + "num_tokens": 743157030.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "grad_norm": 1.6971317529678345, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8911697864532471, + "num_tokens": 743190023.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "grad_norm": 1.6548528671264648, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8909217119216919, + "num_tokens": 743221786.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "grad_norm": 1.4834825992584229, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8845362663269043, + "num_tokens": 743263276.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "grad_norm": 1.5034584999084473, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8997758030891418, + "num_tokens": 743300447.0, + "step": 19483 + }, + { + "epoch": 2.478565068057499, + "grad_norm": 1.4722557067871094, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8956881165504456, + "num_tokens": 743342414.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "grad_norm": 1.4825923442840576, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8891960978507996, + "num_tokens": 743382488.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "grad_norm": 1.5601311922073364, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8767814636230469, + "num_tokens": 743424638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "grad_norm": 1.5666905641555786, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.874534010887146, + "num_tokens": 743464760.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "grad_norm": 1.5335253477096558, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8815757036209106, + "num_tokens": 743504261.0, + "step": 
19488 + }, + { + "epoch": 2.4792011194504515, + "grad_norm": 1.6818140745162964, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8829728364944458, + "num_tokens": 743538247.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "grad_norm": 1.615329623222351, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.891466498374939, + "num_tokens": 743571743.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "grad_norm": 1.7105034589767456, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8921993970870972, + "num_tokens": 743605318.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "grad_norm": 1.499992847442627, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8975498676300049, + "num_tokens": 743642637.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "grad_norm": 1.6506192684173584, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8792355060577393, + "num_tokens": 743678694.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "grad_norm": 1.4784231185913086, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8914660215377808, + "num_tokens": 743714926.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "grad_norm": 1.4295841455459595, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8848788142204285, + "num_tokens": 743757992.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "grad_norm": 1.4576481580734253, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8937087059020996, + "num_tokens": 743799555.0, + "step": 19496 + }, + { + "epoch": 2.4802188016791757, + "grad_norm": 1.773645043373108, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8705121874809265, + "num_tokens": 743835124.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "grad_norm": 1.4912463426589966, + "learning_rate": 1e-06, + "loss": 0.2737, + 
"mean_token_accuracy": 0.9013628959655762, + "num_tokens": 743873076.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "grad_norm": 1.5134696960449219, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.896224319934845, + "num_tokens": 743908423.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "grad_norm": 1.5478198528289795, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8800414800643921, + "num_tokens": 743947484.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "grad_norm": 1.5713317394256592, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.89091557264328, + "num_tokens": 743981978.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "grad_norm": 1.5566025972366333, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8884871006011963, + "num_tokens": 744018641.0, + "step": 19502 + }, + { + "epoch": 2.480982063350719, + "grad_norm": 1.5741095542907715, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8837230205535889, + "num_tokens": 744057888.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "grad_norm": 1.6271231174468994, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8811007738113403, + "num_tokens": 744095162.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "grad_norm": 1.6333726644515991, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8928549289703369, + "num_tokens": 744130187.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "grad_norm": 1.511671543121338, + "learning_rate": 1e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9051820039749146, + "num_tokens": 744164335.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "grad_norm": 1.6199746131896973, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.874647855758667, + "num_tokens": 744204319.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, 
+ "grad_norm": 1.5700064897537231, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8827739953994751, + "num_tokens": 744240508.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "grad_norm": 1.8073471784591675, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8690156936645508, + "num_tokens": 744270728.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "grad_norm": 1.8143550157546997, + "learning_rate": 1e-06, + "loss": 0.2664, + "mean_token_accuracy": 0.9020399451255798, + "num_tokens": 744302984.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "grad_norm": 1.5463154315948486, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8637890219688416, + "num_tokens": 744347605.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "grad_norm": 1.4845737218856812, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.878516674041748, + "num_tokens": 744392161.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "grad_norm": 1.6018686294555664, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8874638080596924, + "num_tokens": 744427805.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "grad_norm": 1.6778196096420288, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8916340470314026, + "num_tokens": 744458631.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "grad_norm": 1.6810044050216675, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8754635453224182, + "num_tokens": 744493350.0, + "step": 19515 + }, + { + "epoch": 2.4826357969723953, + "grad_norm": 1.6963449716567993, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8873523473739624, + "num_tokens": 744527950.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "grad_norm": 1.66853666305542, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8846269249916077, + 
"num_tokens": 744562100.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "grad_norm": 1.4932161569595337, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8904517889022827, + "num_tokens": 744603451.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "grad_norm": 1.507373571395874, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8881168961524963, + "num_tokens": 744642780.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "grad_norm": 1.4920005798339844, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.9026304483413696, + "num_tokens": 744680108.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "grad_norm": 1.6478780508041382, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8907836675643921, + "num_tokens": 744713197.0, + "step": 19521 + }, + { + "epoch": 2.4833990586439385, + "grad_norm": 1.6276943683624268, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8832686543464661, + "num_tokens": 744751215.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "grad_norm": 1.5680078268051147, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.881881058216095, + "num_tokens": 744790379.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "grad_norm": 1.6448519229888916, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8989554643630981, + "num_tokens": 744821884.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "grad_norm": 1.6872252225875854, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8830385804176331, + "num_tokens": 744859413.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "grad_norm": 1.5190036296844482, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8901525735855103, + "num_tokens": 744899187.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "grad_norm": 1.8709626197814941, + 
"learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8610705733299255, + "num_tokens": 744940119.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "grad_norm": 1.7267847061157227, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8771883845329285, + "num_tokens": 744972440.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "grad_norm": 1.454736590385437, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8829541206359863, + "num_tokens": 745013230.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "grad_norm": 1.6479103565216064, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8869200944900513, + "num_tokens": 745047338.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "grad_norm": 1.727734088897705, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8732753992080688, + "num_tokens": 745080862.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "grad_norm": 1.4531267881393433, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8665086030960083, + "num_tokens": 745125364.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "grad_norm": 2.019212484359741, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8677923083305359, + "num_tokens": 745156283.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "grad_norm": 1.6499629020690918, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8776835203170776, + "num_tokens": 745192864.0, + "step": 19534 + }, + { + "epoch": 2.485052792265615, + "grad_norm": 1.4816241264343262, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8880103826522827, + "num_tokens": 745232155.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "grad_norm": 1.4391785860061646, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8874891996383667, + "num_tokens": 745274244.0, + 
"step": 19536 + }, + { + "epoch": 2.485307212822796, + "grad_norm": 1.568024754524231, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.895953357219696, + "num_tokens": 745312417.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "grad_norm": 1.6505274772644043, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.885931134223938, + "num_tokens": 745349895.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "grad_norm": 1.7050532102584839, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8708844780921936, + "num_tokens": 745386772.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "grad_norm": 1.5855523347854614, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8888264894485474, + "num_tokens": 745422115.0, + "step": 19540 + }, + { + "epoch": 2.485816053937158, + "grad_norm": 1.5806424617767334, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8925647735595703, + "num_tokens": 745463380.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "grad_norm": 1.588939905166626, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8999982476234436, + "num_tokens": 745500098.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "grad_norm": 1.4673396348953247, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8858210444450378, + "num_tokens": 745545743.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "grad_norm": 1.4764012098312378, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8989212512969971, + "num_tokens": 745585095.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "grad_norm": 1.5090649127960205, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8884733319282532, + "num_tokens": 745620536.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "grad_norm": 1.5273382663726807, + "learning_rate": 1e-06, + "loss": 
0.2983, + "mean_token_accuracy": 0.8940618634223938, + "num_tokens": 745658683.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "grad_norm": 1.4880943298339844, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8763449788093567, + "num_tokens": 745701839.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "grad_norm": 1.6858820915222168, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8698194026947021, + "num_tokens": 745735300.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "grad_norm": 1.5211650133132935, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8900087475776672, + "num_tokens": 745774506.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "grad_norm": 1.6854265928268433, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8884602189064026, + "num_tokens": 745807539.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "grad_norm": 1.6016621589660645, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8754220008850098, + "num_tokens": 745843151.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "grad_norm": 1.5370888710021973, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8809512257575989, + "num_tokens": 745884605.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "grad_norm": 1.4814976453781128, + "learning_rate": 1e-06, + "loss": 0.2575, + "mean_token_accuracy": 0.9074472188949585, + "num_tokens": 745925387.0, + "step": 19553 + }, + { + "epoch": 2.487469787558835, + "grad_norm": 1.5905125141143799, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8874650001525879, + "num_tokens": 745962445.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "grad_norm": 1.5542856454849243, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8948713541030884, + "num_tokens": 746000519.0, + "step": 19555 + }, + { + "epoch": 
2.487724208116016, + "grad_norm": 1.6047664880752563, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8810441493988037, + "num_tokens": 746037946.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "grad_norm": 1.712288737297058, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8850753903388977, + "num_tokens": 746071136.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "grad_norm": 1.4472222328186035, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8853535652160645, + "num_tokens": 746114162.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "grad_norm": 1.5230990648269653, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8955522775650024, + "num_tokens": 746149960.0, + "step": 19559 + }, + { + "epoch": 2.4882330492303777, + "grad_norm": 1.52316153049469, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8754119873046875, + "num_tokens": 746192120.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "grad_norm": 1.6183043718338013, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8678521513938904, + "num_tokens": 746231437.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "grad_norm": 1.5745294094085693, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8832812309265137, + "num_tokens": 746268180.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "grad_norm": 1.702404260635376, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8691891431808472, + "num_tokens": 746303329.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "grad_norm": 1.4700535535812378, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8901063203811646, + "num_tokens": 746343662.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "grad_norm": 1.6738271713256836, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 
0.8840002417564392, + "num_tokens": 746375144.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "grad_norm": 1.57112455368042, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8673388361930847, + "num_tokens": 746418775.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "grad_norm": 1.4948208332061768, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8922580480575562, + "num_tokens": 746459700.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "grad_norm": 1.5370558500289917, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8967196941375732, + "num_tokens": 746495961.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "grad_norm": 1.5400054454803467, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9000184535980225, + "num_tokens": 746530368.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "grad_norm": 1.5726522207260132, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8872073888778687, + "num_tokens": 746569512.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "grad_norm": 1.4980260133743286, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8907402753829956, + "num_tokens": 746604563.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "grad_norm": 1.503200650215149, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8929502964019775, + "num_tokens": 746643210.0, + "step": 19572 + }, + { + "epoch": 2.4898867828520546, + "grad_norm": 1.506019115447998, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8828399777412415, + "num_tokens": 746684014.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "grad_norm": 1.650757074356079, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.8986073732376099, + "num_tokens": 746716183.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "grad_norm": 
1.4616574048995972, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8879836797714233, + "num_tokens": 746759842.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "grad_norm": 1.5736266374588013, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8862582445144653, + "num_tokens": 746794321.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "grad_norm": 1.456292986869812, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8767877221107483, + "num_tokens": 746837761.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "grad_norm": 1.5602357387542725, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8766105771064758, + "num_tokens": 746879259.0, + "step": 19578 + }, + { + "epoch": 2.4906500445235977, + "grad_norm": 1.6522043943405151, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8953436613082886, + "num_tokens": 746911312.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "grad_norm": 1.7130115032196045, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8796677589416504, + "num_tokens": 746946034.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "grad_norm": 1.5449949502944946, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8821482062339783, + "num_tokens": 746986323.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "grad_norm": 1.572485327720642, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8846023082733154, + "num_tokens": 747021539.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "grad_norm": 1.6561371088027954, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8897778987884521, + "num_tokens": 747056556.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "grad_norm": 1.4107943773269653, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8886277675628662, + "num_tokens": 
747098333.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "grad_norm": 1.554620385169983, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8909038305282593, + "num_tokens": 747134338.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "grad_norm": 1.4150484800338745, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8949933052062988, + "num_tokens": 747173166.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "grad_norm": 1.5804977416992188, + "learning_rate": 1e-06, + "loss": 0.2692, + "mean_token_accuracy": 0.8992913961410522, + "num_tokens": 747205769.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "grad_norm": 1.518546462059021, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8837847113609314, + "num_tokens": 747246958.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "grad_norm": 1.4739640951156616, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8869270086288452, + "num_tokens": 747290231.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "grad_norm": 1.5372151136398315, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8953144550323486, + "num_tokens": 747324278.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "grad_norm": 1.7571598291397095, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8761476874351501, + "num_tokens": 747356394.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "grad_norm": 1.4399782419204712, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8957747220993042, + "num_tokens": 747399196.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "grad_norm": 1.7117334604263306, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8981319665908813, + "num_tokens": 747429029.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "grad_norm": 1.5764130353927612, + "learning_rate": 
1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8843719959259033, + "num_tokens": 747468988.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "grad_norm": 1.5427498817443848, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8994131088256836, + "num_tokens": 747506982.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "grad_norm": 1.6316877603530884, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8790411949157715, + "num_tokens": 747545382.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "grad_norm": 1.5608997344970703, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8816763162612915, + "num_tokens": 747584426.0, + "step": 19597 + }, + { + "epoch": 2.4930670398168173, + "grad_norm": 1.4106470346450806, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8964812159538269, + "num_tokens": 747627912.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "grad_norm": 1.6358760595321655, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.88947993516922, + "num_tokens": 747662969.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "grad_norm": 1.491234302520752, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.891842246055603, + "num_tokens": 747703657.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "grad_norm": 1.5716465711593628, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8920677900314331, + "num_tokens": 747741233.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "grad_norm": 1.4822726249694824, + "learning_rate": 1e-06, + "loss": 0.2694, + "mean_token_accuracy": 0.8995073437690735, + "num_tokens": 747778963.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "grad_norm": 1.5288662910461426, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8872360587120056, + "num_tokens": 747820943.0, + "step": 19603 + }, + { + 
"epoch": 2.4938303014883605, + "grad_norm": 1.646433711051941, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8726716637611389, + "num_tokens": 747858858.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "grad_norm": 1.6132371425628662, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8921357989311218, + "num_tokens": 747891803.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "grad_norm": 1.4877097606658936, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.895339846611023, + "num_tokens": 747932736.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "grad_norm": 1.4546258449554443, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8820126056671143, + "num_tokens": 747978810.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "grad_norm": 1.7123281955718994, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8750969171524048, + "num_tokens": 748013940.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "grad_norm": 1.6692818403244019, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8796465396881104, + "num_tokens": 748050504.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "grad_norm": 1.6841849088668823, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8679874539375305, + "num_tokens": 748087807.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "grad_norm": 1.5135023593902588, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8867213129997253, + "num_tokens": 748125444.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "grad_norm": 1.4826191663742065, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8836579918861389, + "num_tokens": 748168420.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "grad_norm": 1.5250741243362427, + "learning_rate": 1e-06, + "loss": 0.3179, + 
"mean_token_accuracy": 0.8844764232635498, + "num_tokens": 748207914.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "grad_norm": 1.6122970581054688, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8887560367584229, + "num_tokens": 748243148.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "grad_norm": 1.7486454248428345, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8836604356765747, + "num_tokens": 748276970.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "grad_norm": 1.5935171842575073, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.888717770576477, + "num_tokens": 748314691.0, + "step": 19616 + }, + { + "epoch": 2.495484035110037, + "grad_norm": 1.446816325187683, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8845818042755127, + "num_tokens": 748356963.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "grad_norm": 1.6905838251113892, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8639361262321472, + "num_tokens": 748392000.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "grad_norm": 1.5839112997055054, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8851587176322937, + "num_tokens": 748428435.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "grad_norm": 1.537065863609314, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8855018019676208, + "num_tokens": 748468799.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "grad_norm": 1.7860554456710815, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8759415149688721, + "num_tokens": 748499232.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "grad_norm": 1.414815902709961, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8916010856628418, + "num_tokens": 748541223.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + 
"grad_norm": 1.5402514934539795, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8832147121429443, + "num_tokens": 748582810.0, + "step": 19623 + }, + { + "epoch": 2.4963745070601706, + "grad_norm": 1.4885718822479248, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8983511924743652, + "num_tokens": 748622000.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "grad_norm": 1.4199565649032593, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8970277309417725, + "num_tokens": 748663820.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "grad_norm": 1.6507014036178589, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8673985004425049, + "num_tokens": 748701180.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "grad_norm": 1.4312918186187744, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8943992853164673, + "num_tokens": 748743012.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "grad_norm": 1.5214042663574219, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8846659660339355, + "num_tokens": 748783886.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "grad_norm": 1.659019947052002, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.871328592300415, + "num_tokens": 748823084.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "grad_norm": 1.5980240106582642, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8901082277297974, + "num_tokens": 748859348.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "grad_norm": 1.5422691106796265, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8880974054336548, + "num_tokens": 748898635.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "grad_norm": 1.4968445301055908, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8810309171676636, + 
"num_tokens": 748938963.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "grad_norm": 1.7077113389968872, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8851244449615479, + "num_tokens": 748973048.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "grad_norm": 1.5793291330337524, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.892719030380249, + "num_tokens": 749007899.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "grad_norm": 1.5149133205413818, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8867744207382202, + "num_tokens": 749048383.0, + "step": 19635 + }, + { + "epoch": 2.4979010304032565, + "grad_norm": 1.5679407119750977, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8943575024604797, + "num_tokens": 749086092.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "grad_norm": 1.5699576139450073, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8711968660354614, + "num_tokens": 749128314.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "grad_norm": 1.4858932495117188, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8951045274734497, + "num_tokens": 749169438.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "grad_norm": 1.56390380859375, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8767189979553223, + "num_tokens": 749206654.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "grad_norm": 1.631163477897644, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8805612921714783, + "num_tokens": 749243754.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "grad_norm": 1.4482539892196655, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.897723913192749, + "num_tokens": 749283871.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "grad_norm": 1.6232106685638428, + 
"learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8807889223098755, + "num_tokens": 749318965.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "grad_norm": 1.4867500066757202, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8919447064399719, + "num_tokens": 749359382.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "grad_norm": 1.6059648990631104, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8815042972564697, + "num_tokens": 749394323.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "grad_norm": 1.395555019378662, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8802279233932495, + "num_tokens": 749443460.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "grad_norm": 1.5784333944320679, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.889815092086792, + "num_tokens": 749484297.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "grad_norm": 1.588908076286316, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8807240128517151, + "num_tokens": 749526802.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "grad_norm": 1.519646406173706, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8814246654510498, + "num_tokens": 749568404.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "grad_norm": 1.5306059122085571, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8794812560081482, + "num_tokens": 749612709.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "grad_norm": 1.7638016939163208, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8922350406646729, + "num_tokens": 749642965.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "grad_norm": 1.644516944885254, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8890963792800903, + "num_tokens": 749678313.0, + "step": 
19651 + }, + { + "epoch": 2.499936394860705, + "grad_norm": 1.5324140787124634, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.8977973461151123, + "num_tokens": 749714445.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "grad_norm": 1.6506221294403076, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8788344860076904, + "num_tokens": 749751948.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "grad_norm": 1.6035329103469849, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8868039846420288, + "num_tokens": 749790594.0, + "step": 19654 + }, + { + "epoch": 2.500318025696476, + "grad_norm": 1.5474028587341309, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8829478025436401, + "num_tokens": 749829767.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "grad_norm": 1.575140118598938, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8896684646606445, + "num_tokens": 749864635.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "grad_norm": 1.6287975311279297, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8711351156234741, + "num_tokens": 749906932.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "grad_norm": 1.6471678018569946, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8765921592712402, + "num_tokens": 749944714.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "grad_norm": 1.479912519454956, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8927314281463623, + "num_tokens": 749981503.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "grad_norm": 1.5304725170135498, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8742546439170837, + "num_tokens": 750024784.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "grad_norm": 1.430100440979004, + "learning_rate": 1e-06, + "loss": 0.2895, + 
"mean_token_accuracy": 0.894706130027771, + "num_tokens": 750068034.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "grad_norm": 1.45729398727417, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8890593647956848, + "num_tokens": 750108563.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "grad_norm": 1.5892534255981445, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8901581764221191, + "num_tokens": 750142108.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "grad_norm": 1.4638179540634155, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8900130987167358, + "num_tokens": 750181344.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "grad_norm": 1.553317904472351, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8813745379447937, + "num_tokens": 750219485.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "grad_norm": 1.5236207246780396, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8857697248458862, + "num_tokens": 750263574.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "grad_norm": 1.437752604484558, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8991961479187012, + "num_tokens": 750302722.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "grad_norm": 1.4650325775146484, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8943057060241699, + "num_tokens": 750341916.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "grad_norm": 1.4707309007644653, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8817397356033325, + "num_tokens": 750382710.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "grad_norm": 1.6632143259048462, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8723639249801636, + "num_tokens": 750417060.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + 
"grad_norm": 1.6674673557281494, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8871402740478516, + "num_tokens": 750449317.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "grad_norm": 1.5126897096633911, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8933608531951904, + "num_tokens": 750490517.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "grad_norm": 1.4803181886672974, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8845793008804321, + "num_tokens": 750532761.0, + "step": 19673 + }, + { + "epoch": 2.5027350209896957, + "grad_norm": 1.494312047958374, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8756569027900696, + "num_tokens": 750577983.0, + "step": 19674 + }, + { + "epoch": 2.5028622312682867, + "grad_norm": 1.7056353092193604, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8831890821456909, + "num_tokens": 750611538.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "grad_norm": 1.4287666082382202, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8874422907829285, + "num_tokens": 750654131.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "grad_norm": 1.51064133644104, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8950832486152649, + "num_tokens": 750690103.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "grad_norm": 1.5492037534713745, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8851734399795532, + "num_tokens": 750727876.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "grad_norm": 1.5289034843444824, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8950892686843872, + "num_tokens": 750764443.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "grad_norm": 1.6405482292175293, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8904247879981995, + 
"num_tokens": 750800046.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "grad_norm": 1.5454100370407104, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8913152813911438, + "num_tokens": 750835948.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "grad_norm": 1.5755668878555298, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8802348375320435, + "num_tokens": 750876748.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "grad_norm": 1.6086361408233643, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8755555152893066, + "num_tokens": 750918502.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "grad_norm": 1.7054245471954346, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.873605489730835, + "num_tokens": 750954206.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "grad_norm": 1.6056548357009888, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8838434219360352, + "num_tokens": 750992326.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "grad_norm": 1.6435476541519165, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8864452838897705, + "num_tokens": 751026392.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "grad_norm": 1.535622477531433, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8871615529060364, + "num_tokens": 751065764.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "grad_norm": 1.6566425561904907, + "learning_rate": 1e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9012410640716553, + "num_tokens": 751093941.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "grad_norm": 1.564710021018982, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8918050527572632, + "num_tokens": 751131078.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "grad_norm": 1.4711905717849731, + 
"learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8839351534843445, + "num_tokens": 751171718.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "grad_norm": 1.4735239744186401, + "learning_rate": 1e-06, + "loss": 0.2775, + "mean_token_accuracy": 0.8981648683547974, + "num_tokens": 751206342.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "grad_norm": 1.5499404668807983, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8720003366470337, + "num_tokens": 751246560.0, + "step": 19692 + }, + { + "epoch": 2.5051520162829157, + "grad_norm": 1.730002999305725, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8690477609634399, + "num_tokens": 751283133.0, + "step": 19693 + }, + { + "epoch": 2.5052792265615063, + "grad_norm": 1.6479672193527222, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8873360753059387, + "num_tokens": 751318073.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "grad_norm": 1.756241798400879, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8723925948143005, + "num_tokens": 751351720.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "grad_norm": 1.718167781829834, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8897030353546143, + "num_tokens": 751382350.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "grad_norm": 1.5577512979507446, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8909958004951477, + "num_tokens": 751417818.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "grad_norm": 1.6213738918304443, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8949331641197205, + "num_tokens": 751451762.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "grad_norm": 1.719486117362976, + "learning_rate": 1e-06, + "loss": 0.2721, + "mean_token_accuracy": 0.8991119265556335, + "num_tokens": 751481592.0, + "step": 
19699 + }, + { + "epoch": 2.5060424882330494, + "grad_norm": 1.5351877212524414, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8998626470565796, + "num_tokens": 751515946.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "grad_norm": 1.5759389400482178, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8851808309555054, + "num_tokens": 751553398.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "grad_norm": 1.4405252933502197, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8805795907974243, + "num_tokens": 751594942.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "grad_norm": 1.5882073640823364, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8782635927200317, + "num_tokens": 751636219.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "grad_norm": 1.4710710048675537, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8771743774414062, + "num_tokens": 751680587.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "grad_norm": 1.5961803197860718, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.890458881855011, + "num_tokens": 751717419.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "grad_norm": 1.4826406240463257, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.88755202293396, + "num_tokens": 751758720.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "grad_norm": 1.7297065258026123, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8888767957687378, + "num_tokens": 751792260.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "grad_norm": 1.7540476322174072, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8721597790718079, + "num_tokens": 751827048.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "grad_norm": 1.5644563436508179, + "learning_rate": 1e-06, + "loss": 0.2999, 
+ "mean_token_accuracy": 0.8941465616226196, + "num_tokens": 751864019.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "grad_norm": 1.7920163869857788, + "learning_rate": 1e-06, + "loss": 0.2757, + "mean_token_accuracy": 0.8978933095932007, + "num_tokens": 751892113.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "grad_norm": 1.5127965211868286, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8901866674423218, + "num_tokens": 751932726.0, + "step": 19711 + }, + { + "epoch": 2.5075690115761353, + "grad_norm": 1.5940861701965332, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8847466707229614, + "num_tokens": 751971679.0, + "step": 19712 + }, + { + "epoch": 2.507696221854726, + "grad_norm": 1.585677981376648, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8772927522659302, + "num_tokens": 752011644.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "grad_norm": 1.554921269416809, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.9044520854949951, + "num_tokens": 752047105.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "grad_norm": 1.5443071126937866, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8821811676025391, + "num_tokens": 752087110.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "grad_norm": 1.518963098526001, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8762763142585754, + "num_tokens": 752127237.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "grad_norm": 1.6567133665084839, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8618991374969482, + "num_tokens": 752165981.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "grad_norm": 1.5032920837402344, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.88551926612854, + "num_tokens": 752203627.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, 
+ "grad_norm": 1.5700325965881348, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8750913143157959, + "num_tokens": 752243338.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "grad_norm": 1.559141993522644, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8804453015327454, + "num_tokens": 752283836.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "grad_norm": 1.7430799007415771, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8740523457527161, + "num_tokens": 752316191.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "grad_norm": 1.5149949789047241, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8881305456161499, + "num_tokens": 752355417.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "grad_norm": 1.5795009136199951, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8657538890838623, + "num_tokens": 752398340.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "grad_norm": 1.639835000038147, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8894158601760864, + "num_tokens": 752432637.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "grad_norm": 1.6054141521453857, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8763943314552307, + "num_tokens": 752471242.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "grad_norm": 1.5220528841018677, + "learning_rate": 1e-06, + "loss": 0.2593, + "mean_token_accuracy": 0.9019834399223328, + "num_tokens": 752506508.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "grad_norm": 1.468593716621399, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.882370114326477, + "num_tokens": 752549472.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "grad_norm": 1.5074759721755981, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8862961530685425, + 
"num_tokens": 752585042.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "grad_norm": 1.6192377805709839, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8787391185760498, + "num_tokens": 752623558.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "grad_norm": 1.5824815034866333, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8910783529281616, + "num_tokens": 752666692.0, + "step": 19730 + }, + { + "epoch": 2.509986006869355, + "grad_norm": 1.4903819561004639, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8955349922180176, + "num_tokens": 752706304.0, + "step": 19731 + }, + { + "epoch": 2.5101132171479454, + "grad_norm": 1.610795259475708, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8882365226745605, + "num_tokens": 752743634.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "grad_norm": 1.6748623847961426, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8902926445007324, + "num_tokens": 752774013.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "grad_norm": 1.4737321138381958, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8801658749580383, + "num_tokens": 752811336.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "grad_norm": 1.513594388961792, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8875914812088013, + "num_tokens": 752850096.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "grad_norm": 1.564528465270996, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8918223977088928, + "num_tokens": 752888747.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "grad_norm": 1.6492584943771362, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8690577745437622, + "num_tokens": 752925385.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "grad_norm": 1.5698872804641724, + 
"learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8721494674682617, + "num_tokens": 752969638.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "grad_norm": 1.5568115711212158, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8851828575134277, + "num_tokens": 753005236.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "grad_norm": 1.3763476610183716, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9016426801681519, + "num_tokens": 753048508.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "grad_norm": 1.5752986669540405, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8729894757270813, + "num_tokens": 753089720.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "grad_norm": 1.5536770820617676, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8870395421981812, + "num_tokens": 753128545.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "grad_norm": 1.5838991403579712, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8682563900947571, + "num_tokens": 753170128.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "grad_norm": 1.6462883949279785, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8891997933387756, + "num_tokens": 753204972.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "grad_norm": 1.546458125114441, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.883091151714325, + "num_tokens": 753243405.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "grad_norm": 1.7028334140777588, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8734760284423828, + "num_tokens": 753276797.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "grad_norm": 1.5652518272399902, + "learning_rate": 1e-06, + "loss": 0.2672, + "mean_token_accuracy": 0.9002974033355713, + "num_tokens": 753313034.0, + "step": 
19747 + }, + { + "epoch": 2.512148581605394, + "grad_norm": 1.580051302909851, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8889528512954712, + "num_tokens": 753352166.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "grad_norm": 1.6022850275039673, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8869068622589111, + "num_tokens": 753389153.0, + "step": 19749 + }, + { + "epoch": 2.512403002162575, + "grad_norm": 1.5252186059951782, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8978376388549805, + "num_tokens": 753425379.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "grad_norm": 1.5595202445983887, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8813847303390503, + "num_tokens": 753465606.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "grad_norm": 1.4568803310394287, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8900364637374878, + "num_tokens": 753504987.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "grad_norm": 1.4328409433364868, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8870048522949219, + "num_tokens": 753549588.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "grad_norm": 1.5501960515975952, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8897562623023987, + "num_tokens": 753585628.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "grad_norm": 1.480236530303955, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8884240984916687, + "num_tokens": 753626200.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "grad_norm": 1.7371443510055542, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8830051422119141, + "num_tokens": 753662035.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "grad_norm": 1.5629780292510986, + "learning_rate": 1e-06, + "loss": 0.3343, + 
"mean_token_accuracy": 0.8792827725410461, + "num_tokens": 753699823.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "grad_norm": 1.618004322052002, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8771322965621948, + "num_tokens": 753738131.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "grad_norm": 1.464544653892517, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8899307250976562, + "num_tokens": 753776062.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "grad_norm": 1.6739249229431152, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8956208229064941, + "num_tokens": 753807717.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "grad_norm": 1.5746978521347046, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8740143775939941, + "num_tokens": 753847613.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "grad_norm": 1.5924181938171387, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8714302778244019, + "num_tokens": 753885578.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "grad_norm": 1.5673713684082031, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8846492171287537, + "num_tokens": 753923122.0, + "step": 19763 + }, + { + "epoch": 2.514183946062842, + "grad_norm": 1.4601881504058838, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8814142942428589, + "num_tokens": 753968881.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "grad_norm": 1.5415304899215698, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8733762502670288, + "num_tokens": 754011808.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "grad_norm": 1.5477900505065918, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8966590762138367, + "num_tokens": 754045726.0, + "step": 19766 + }, + { + "epoch": 
2.5145655768986135, + "grad_norm": 1.4580472707748413, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8972103595733643, + "num_tokens": 754085931.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "grad_norm": 1.5409247875213623, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8750079274177551, + "num_tokens": 754126140.0, + "step": 19768 + }, + { + "epoch": 2.5148199974557945, + "grad_norm": 1.847298502922058, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8618713617324829, + "num_tokens": 754159992.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "grad_norm": 1.7033942937850952, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8980188369750977, + "num_tokens": 754191654.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "grad_norm": 1.6108715534210205, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8948739171028137, + "num_tokens": 754230372.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "grad_norm": 1.621798038482666, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8814404606819153, + "num_tokens": 754267052.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "grad_norm": 1.4639840126037598, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8914598822593689, + "num_tokens": 754308536.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "grad_norm": 1.5299962759017944, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8914426565170288, + "num_tokens": 754345277.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "grad_norm": 1.5475258827209473, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8925564289093018, + "num_tokens": 754384297.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "grad_norm": 1.5897653102874756, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 
0.8850761651992798, + "num_tokens": 754424021.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "grad_norm": 1.496084451675415, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8905999660491943, + "num_tokens": 754461549.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "grad_norm": 1.5320674180984497, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8953825831413269, + "num_tokens": 754498592.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "grad_norm": 1.5027658939361572, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8866091966629028, + "num_tokens": 754541064.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "grad_norm": 1.5686448812484741, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8736142516136169, + "num_tokens": 754581050.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "grad_norm": 1.5936168432235718, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8814622163772583, + "num_tokens": 754617986.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "grad_norm": 1.5868620872497559, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8784430623054504, + "num_tokens": 754655235.0, + "step": 19782 + }, + { + "epoch": 2.5166009413560615, + "grad_norm": 1.535220742225647, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8869187831878662, + "num_tokens": 754695794.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "grad_norm": 1.693455696105957, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8918226957321167, + "num_tokens": 754730044.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "grad_norm": 1.4459198713302612, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.881711483001709, + "num_tokens": 754771389.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "grad_norm": 
1.5600117444992065, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8941141366958618, + "num_tokens": 754804896.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "grad_norm": 1.6346659660339355, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8883994817733765, + "num_tokens": 754840349.0, + "step": 19787 + }, + { + "epoch": 2.517236992749014, + "grad_norm": 1.5253498554229736, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8942269086837769, + "num_tokens": 754880497.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "grad_norm": 1.5981162786483765, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8915582299232483, + "num_tokens": 754916287.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "grad_norm": 1.5385066270828247, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8934006094932556, + "num_tokens": 754951416.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "grad_norm": 1.4118725061416626, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8869327306747437, + "num_tokens": 754995756.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "grad_norm": 1.656430959701538, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8796782493591309, + "num_tokens": 755034561.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "grad_norm": 1.4336445331573486, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8947384357452393, + "num_tokens": 755074063.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "grad_norm": 1.4577776193618774, + "learning_rate": 1e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9024908542633057, + "num_tokens": 755111537.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "grad_norm": 1.475610375404358, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8865195512771606, + 
"num_tokens": 755155411.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "grad_norm": 1.535702109336853, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8865697979927063, + "num_tokens": 755191841.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "grad_norm": 1.501079797744751, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8879293203353882, + "num_tokens": 755229881.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "grad_norm": 1.4297380447387695, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8885007500648499, + "num_tokens": 755270933.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "grad_norm": 1.5655330419540405, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8872115612030029, + "num_tokens": 755307788.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "grad_norm": 1.601006269454956, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8869564533233643, + "num_tokens": 755342515.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "grad_norm": 1.523697853088379, + "learning_rate": 1e-06, + "loss": 0.2672, + "mean_token_accuracy": 0.9023662805557251, + "num_tokens": 755377286.0, + "step": 19801 + }, + { + "epoch": 2.5190179366492815, + "grad_norm": 1.6513864994049072, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8789565563201904, + "num_tokens": 755418695.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "grad_norm": 1.4402589797973633, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8947343230247498, + "num_tokens": 755461720.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "grad_norm": 1.5647746324539185, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8839279413223267, + "num_tokens": 755500955.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "grad_norm": 1.399009108543396, + 
"learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8832684755325317, + "num_tokens": 755547956.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "grad_norm": 1.8414306640625, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.886770486831665, + "num_tokens": 755576855.0, + "step": 19806 + }, + { + "epoch": 2.5196539880422337, + "grad_norm": 1.5562963485717773, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8856877088546753, + "num_tokens": 755616137.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "grad_norm": 1.4278244972229004, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8961376547813416, + "num_tokens": 755655150.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "grad_norm": 1.5366113185882568, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8877196311950684, + "num_tokens": 755697725.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "grad_norm": 1.6397937536239624, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8784008026123047, + "num_tokens": 755734172.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "grad_norm": 1.6693105697631836, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8830908536911011, + "num_tokens": 755769302.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "grad_norm": 1.5197043418884277, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8837172985076904, + "num_tokens": 755808709.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "grad_norm": 1.660467267036438, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8761357069015503, + "num_tokens": 755847200.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "grad_norm": 1.546112060546875, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8772872686386108, + "num_tokens": 755892354.0, + "step": 
19814 + }, + { + "epoch": 2.520671670270958, + "grad_norm": 1.5664708614349365, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8903087377548218, + "num_tokens": 755929583.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "grad_norm": 1.4351505041122437, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8795449137687683, + "num_tokens": 755972322.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "grad_norm": 1.5297240018844604, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8886377811431885, + "num_tokens": 756010300.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "grad_norm": 1.4395842552185059, + "learning_rate": 1e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9049607515335083, + "num_tokens": 756047476.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "grad_norm": 1.5072107315063477, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.8986361026763916, + "num_tokens": 756082541.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "grad_norm": 1.5037310123443604, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.889343798160553, + "num_tokens": 756122246.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "grad_norm": 1.4999486207962036, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8924538493156433, + "num_tokens": 756162152.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "grad_norm": 1.5867743492126465, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8825640678405762, + "num_tokens": 756199410.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "grad_norm": 1.6271088123321533, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.884398341178894, + "num_tokens": 756235800.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "grad_norm": 1.4594119787216187, + "learning_rate": 1e-06, + "loss": 0.2934, + 
"mean_token_accuracy": 0.8950175642967224, + "num_tokens": 756275200.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "grad_norm": 1.4909414052963257, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8800145983695984, + "num_tokens": 756315932.0, + "step": 19825 + }, + { + "epoch": 2.5220709833354533, + "grad_norm": 1.537221074104309, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8788087368011475, + "num_tokens": 756361621.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "grad_norm": 1.8105322122573853, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8845025897026062, + "num_tokens": 756391666.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "grad_norm": 1.4905660152435303, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8825341463088989, + "num_tokens": 756431578.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "grad_norm": 1.7256516218185425, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8959903120994568, + "num_tokens": 756463948.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "grad_norm": 1.5276581048965454, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8860254287719727, + "num_tokens": 756507157.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "grad_norm": 1.5520315170288086, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8924253582954407, + "num_tokens": 756545858.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "grad_norm": 1.4682515859603882, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8835861682891846, + "num_tokens": 756584489.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "grad_norm": 1.6325838565826416, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8950551748275757, + "num_tokens": 756617218.0, + "step": 19833 + }, + { + "epoch": 
2.5230886655641775, + "grad_norm": 1.6068230867385864, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8817107677459717, + "num_tokens": 756656805.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "grad_norm": 1.712716817855835, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8837627172470093, + "num_tokens": 756696715.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "grad_norm": 1.5310654640197754, + "learning_rate": 1e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.8934468030929565, + "num_tokens": 756733626.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "grad_norm": 1.6679561138153076, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8893980979919434, + "num_tokens": 756764847.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "grad_norm": 1.5879637002944946, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8700546026229858, + "num_tokens": 756808718.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "grad_norm": 1.480194330215454, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8934142589569092, + "num_tokens": 756849099.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "grad_norm": 1.541947603225708, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8917235732078552, + "num_tokens": 756888652.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "grad_norm": 1.774488925933838, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8913519978523254, + "num_tokens": 756920512.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "grad_norm": 1.5018001794815063, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.878517210483551, + "num_tokens": 756964525.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "grad_norm": 1.5645127296447754, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 
0.8863522410392761, + "num_tokens": 757000700.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "grad_norm": 1.5405211448669434, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8843986988067627, + "num_tokens": 757039664.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "grad_norm": 1.6200019121170044, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8744888305664062, + "num_tokens": 757078144.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "grad_norm": 1.5485836267471313, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8892834186553955, + "num_tokens": 757114833.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "grad_norm": 1.5781660079956055, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8826897740364075, + "num_tokens": 757156051.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "grad_norm": 1.5470359325408936, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8902277946472168, + "num_tokens": 757196147.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "grad_norm": 1.6519628763198853, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8907665610313416, + "num_tokens": 757228638.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "grad_norm": 1.5473264455795288, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8783456087112427, + "num_tokens": 757267444.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "grad_norm": 1.5626344680786133, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8945114016532898, + "num_tokens": 757302079.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "grad_norm": 1.4490734338760376, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8921926617622375, + "num_tokens": 757342974.0, + "step": 19852 + }, + { + "epoch": 2.525505660857397, + "grad_norm": 
1.708883285522461, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8773096203804016, + "num_tokens": 757379555.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "grad_norm": 1.7215672731399536, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8737103939056396, + "num_tokens": 757415755.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "grad_norm": 1.536433219909668, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8883866667747498, + "num_tokens": 757450213.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "grad_norm": 1.4932082891464233, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.881306529045105, + "num_tokens": 757493484.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "grad_norm": 1.4368560314178467, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8933453559875488, + "num_tokens": 757535895.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "grad_norm": 1.6957285404205322, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8969300389289856, + "num_tokens": 757566984.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "grad_norm": 1.5614039897918701, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8960826396942139, + "num_tokens": 757603707.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "grad_norm": 1.4681490659713745, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.888567328453064, + "num_tokens": 757640415.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "grad_norm": 1.4645915031433105, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8939653635025024, + "num_tokens": 757681247.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "grad_norm": 1.6219056844711304, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8837674856185913, + "num_tokens": 
757717612.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "grad_norm": 1.5430189371109009, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8857274055480957, + "num_tokens": 757762314.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "grad_norm": 1.5517922639846802, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8947635889053345, + "num_tokens": 757800051.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "grad_norm": 1.5178463459014893, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8886840343475342, + "num_tokens": 757838631.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "grad_norm": 1.5555670261383057, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8937071561813354, + "num_tokens": 757874761.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "grad_norm": 1.6248587369918823, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8718421459197998, + "num_tokens": 757910756.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "grad_norm": 1.6053626537322998, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8730752468109131, + "num_tokens": 757948270.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "grad_norm": 1.5532076358795166, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8932944536209106, + "num_tokens": 757980753.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "grad_norm": 1.588335394859314, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8862797021865845, + "num_tokens": 758021728.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "grad_norm": 1.55951988697052, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8852566480636597, + "num_tokens": 758059588.0, + "step": 19871 + }, + { + "epoch": 2.5279226561506167, + "grad_norm": 1.499534249305725, + "learning_rate": 
1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8918101787567139, + "num_tokens": 758097866.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "grad_norm": 1.475799560546875, + "learning_rate": 1e-06, + "loss": 0.2691, + "mean_token_accuracy": 0.9011963605880737, + "num_tokens": 758136980.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "grad_norm": 1.6184096336364746, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.899212658405304, + "num_tokens": 758167511.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "grad_norm": 1.6250336170196533, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9010016918182373, + "num_tokens": 758202523.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "grad_norm": 1.4653260707855225, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8848031759262085, + "num_tokens": 758246441.0, + "step": 19876 + }, + { + "epoch": 2.5285587075435694, + "grad_norm": 1.623079776763916, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8874062895774841, + "num_tokens": 758283934.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "grad_norm": 1.7249020338058472, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8833907842636108, + "num_tokens": 758316411.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "grad_norm": 1.5711376667022705, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8859807252883911, + "num_tokens": 758354719.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "grad_norm": 1.4828803539276123, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.892409086227417, + "num_tokens": 758393597.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "grad_norm": 1.6446789503097534, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8763692378997803, + "num_tokens": 758429140.0, + "step": 19881 + }, + { + 
"epoch": 2.529194758936522, + "grad_norm": 1.6068613529205322, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8882803916931152, + "num_tokens": 758467097.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "grad_norm": 1.5592398643493652, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8827533721923828, + "num_tokens": 758505156.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "grad_norm": 1.6532763242721558, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8861029148101807, + "num_tokens": 758541094.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "grad_norm": 1.3798909187316895, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8913018703460693, + "num_tokens": 758588353.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "grad_norm": 1.654791235923767, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8830403089523315, + "num_tokens": 758629307.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "grad_norm": 1.4140610694885254, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8969846963882446, + "num_tokens": 758672599.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "grad_norm": 1.539324402809143, + "learning_rate": 1e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9020127058029175, + "num_tokens": 758706200.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "grad_norm": 1.3981093168258667, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8872936964035034, + "num_tokens": 758749084.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "grad_norm": 1.5564472675323486, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8938226103782654, + "num_tokens": 758784354.0, + "step": 19890 + }, + { + "epoch": 2.5303396514438368, + "grad_norm": 1.53836190700531, + "learning_rate": 1e-06, + "loss": 0.2993, + 
"mean_token_accuracy": 0.892998218536377, + "num_tokens": 758824458.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "grad_norm": 1.5885483026504517, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8988494873046875, + "num_tokens": 758858304.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "grad_norm": 1.6700574159622192, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8767395615577698, + "num_tokens": 758892886.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "grad_norm": 1.5351632833480835, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8749732375144958, + "num_tokens": 758935426.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "grad_norm": 1.476217269897461, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8859180212020874, + "num_tokens": 758974705.0, + "step": 19895 + }, + { + "epoch": 2.5309757028367894, + "grad_norm": 1.596182107925415, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8859909772872925, + "num_tokens": 759012545.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "grad_norm": 1.5996259450912476, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8800032138824463, + "num_tokens": 759050657.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "grad_norm": 1.6323338747024536, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8876583576202393, + "num_tokens": 759091605.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "grad_norm": 1.57890784740448, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8808151483535767, + "num_tokens": 759130321.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "grad_norm": 1.5172169208526611, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8835294842720032, + "num_tokens": 759172285.0, + "step": 19900 + }, + { + "epoch": 
2.5316117542297416, + "grad_norm": 1.4660807847976685, + "learning_rate": 1e-06, + "loss": 0.2744, + "mean_token_accuracy": 0.9017313122749329, + "num_tokens": 759214952.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "grad_norm": 1.550508975982666, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.879565954208374, + "num_tokens": 759255542.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "grad_norm": 1.6409317255020142, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8891599774360657, + "num_tokens": 759291260.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "grad_norm": 1.624750018119812, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8703227043151855, + "num_tokens": 759330316.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "grad_norm": 1.8125377893447876, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8809130787849426, + "num_tokens": 759359667.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "grad_norm": 1.5935755968093872, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8890441656112671, + "num_tokens": 759395646.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "grad_norm": 1.8723149299621582, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8637056350708008, + "num_tokens": 759428762.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "grad_norm": 1.6967841386795044, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.879340410232544, + "num_tokens": 759463320.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "grad_norm": 1.5061047077178955, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.884624719619751, + "num_tokens": 759501818.0, + "step": 19909 + }, + { + "epoch": 2.5327566467370564, + "grad_norm": 1.5788840055465698, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 
0.8900213837623596, + "num_tokens": 759542446.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "grad_norm": 1.5319212675094604, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8712956309318542, + "num_tokens": 759583148.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "grad_norm": 1.6065865755081177, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8768384456634521, + "num_tokens": 759623594.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "grad_norm": 1.6182529926300049, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8813310265541077, + "num_tokens": 759663922.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "grad_norm": 1.6048861742019653, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.880171000957489, + "num_tokens": 759702807.0, + "step": 19914 + }, + { + "epoch": 2.533392698130009, + "grad_norm": 1.921541452407837, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.875825047492981, + "num_tokens": 759731754.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "grad_norm": 1.5848795175552368, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8748505115509033, + "num_tokens": 759769108.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "grad_norm": 1.5964385271072388, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8795528411865234, + "num_tokens": 759810582.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "grad_norm": 1.5339094400405884, + "learning_rate": 1e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.9026487469673157, + "num_tokens": 759851129.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "grad_norm": 1.5511060953140259, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8833173513412476, + "num_tokens": 759892664.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "grad_norm": 
1.6260920763015747, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.879488468170166, + "num_tokens": 759931145.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "grad_norm": 1.522230625152588, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8811472654342651, + "num_tokens": 759971262.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "grad_norm": 1.5942896604537964, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.894719123840332, + "num_tokens": 760004262.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "grad_norm": 1.4674270153045654, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.8998437523841858, + "num_tokens": 760043575.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "grad_norm": 1.518423318862915, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8935824632644653, + "num_tokens": 760082196.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "grad_norm": 1.6321829557418823, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8872363567352295, + "num_tokens": 760115984.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "grad_norm": 1.6018142700195312, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8844844102859497, + "num_tokens": 760153902.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "grad_norm": 1.7802547216415405, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8953779935836792, + "num_tokens": 760182654.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "grad_norm": 1.5476703643798828, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8947017192840576, + "num_tokens": 760216746.0, + "step": 19928 + }, + { + "epoch": 2.535173642030276, + "grad_norm": 1.5983610153198242, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8752692937850952, + "num_tokens": 
760259023.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "grad_norm": 1.4558171033859253, + "learning_rate": 1e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9059499502182007, + "num_tokens": 760297646.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "grad_norm": 1.536599040031433, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.8979924321174622, + "num_tokens": 760334298.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "grad_norm": 1.5678011178970337, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8808788061141968, + "num_tokens": 760371228.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "grad_norm": 1.5317294597625732, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8829587697982788, + "num_tokens": 760409462.0, + "step": 19933 + }, + { + "epoch": 2.5358096934232286, + "grad_norm": 1.627958059310913, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8742278814315796, + "num_tokens": 760448604.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "grad_norm": 1.4422918558120728, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8969207406044006, + "num_tokens": 760490918.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "grad_norm": 1.628691554069519, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8884612321853638, + "num_tokens": 760530111.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "grad_norm": 1.3888095617294312, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8950662612915039, + "num_tokens": 760572736.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "grad_norm": 1.530941128730774, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.8986725807189941, + "num_tokens": 760610013.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "grad_norm": 1.4251161813735962, + "learning_rate": 1e-06, + 
"loss": 0.2885, + "mean_token_accuracy": 0.8941822052001953, + "num_tokens": 760651266.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "grad_norm": 1.6321556568145752, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8804494738578796, + "num_tokens": 760690378.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "grad_norm": 1.6724821329116821, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8807919025421143, + "num_tokens": 760731732.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "grad_norm": 1.6947945356369019, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8737595677375793, + "num_tokens": 760771431.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "grad_norm": 1.8119877576828003, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8720607161521912, + "num_tokens": 760804279.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "grad_norm": 1.6394338607788086, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8821209669113159, + "num_tokens": 760839951.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "grad_norm": 1.6926151514053345, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8792338371276855, + "num_tokens": 760876337.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "grad_norm": 1.5096218585968018, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8843812346458435, + "num_tokens": 760918529.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "grad_norm": 1.5770761966705322, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8876235485076904, + "num_tokens": 760959211.0, + "step": 19947 + }, + { + "epoch": 2.537590637323496, + "grad_norm": 1.4777979850769043, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8838515281677246, + "num_tokens": 760999323.0, + "step": 19948 + }, + { + 
"epoch": 2.537717847602086, + "grad_norm": 1.621962070465088, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8869373798370361, + "num_tokens": 761033358.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "grad_norm": 1.4707354307174683, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8914901614189148, + "num_tokens": 761077684.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "grad_norm": 1.6086739301681519, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.883660078048706, + "num_tokens": 761112127.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "grad_norm": 1.720556616783142, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8846290111541748, + "num_tokens": 761143655.0, + "step": 19952 + }, + { + "epoch": 2.538226688716448, + "grad_norm": 1.6604539155960083, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8877958655357361, + "num_tokens": 761176432.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "grad_norm": 1.5749003887176514, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8910083174705505, + "num_tokens": 761212717.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "grad_norm": 1.386940360069275, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.8989689350128174, + "num_tokens": 761254240.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "grad_norm": 1.4691894054412842, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8894903659820557, + "num_tokens": 761295703.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "grad_norm": 1.6611396074295044, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8774422407150269, + "num_tokens": 761332285.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "grad_norm": 1.6417967081069946, + "learning_rate": 1e-06, + "loss": 0.284, + 
"mean_token_accuracy": 0.8966959714889526, + "num_tokens": 761366686.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "grad_norm": 1.6286507844924927, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8669208884239197, + "num_tokens": 761404664.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "grad_norm": 1.6411516666412354, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8679572343826294, + "num_tokens": 761441595.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "grad_norm": 1.5689181089401245, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8941432237625122, + "num_tokens": 761480730.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "grad_norm": 1.663700819015503, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8604435324668884, + "num_tokens": 761517963.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "grad_norm": 1.632559895515442, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8772749900817871, + "num_tokens": 761555014.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "grad_norm": 1.56141996383667, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8859030604362488, + "num_tokens": 761594131.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "grad_norm": 1.6876826286315918, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8816314339637756, + "num_tokens": 761630197.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "grad_norm": 1.6469825506210327, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8852762579917908, + "num_tokens": 761663044.0, + "step": 19966 + }, + { + "epoch": 2.5400076326167156, + "grad_norm": 1.5435726642608643, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8781206011772156, + "num_tokens": 761707374.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, 
+ "grad_norm": 1.7091630697250366, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8642660975456238, + "num_tokens": 761746308.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "grad_norm": 1.6592626571655273, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8833455443382263, + "num_tokens": 761783826.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "grad_norm": 1.8223283290863037, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.890012264251709, + "num_tokens": 761814801.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "grad_norm": 1.6920987367630005, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.883117139339447, + "num_tokens": 761848046.0, + "step": 19971 + }, + { + "epoch": 2.540643684009668, + "grad_norm": 1.632417917251587, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8806618452072144, + "num_tokens": 761884005.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "grad_norm": 1.5899326801300049, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8889007568359375, + "num_tokens": 761919603.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "grad_norm": 1.6149332523345947, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8780918121337891, + "num_tokens": 761956963.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "grad_norm": 1.6214802265167236, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8871694803237915, + "num_tokens": 761993898.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "grad_norm": 1.6154662370681763, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8872480392456055, + "num_tokens": 762029836.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "grad_norm": 1.5737848281860352, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8778521418571472, + 
"num_tokens": 762069210.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "grad_norm": 1.6026870012283325, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8961539268493652, + "num_tokens": 762106110.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "grad_norm": 1.5216609239578247, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8924264907836914, + "num_tokens": 762143880.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "grad_norm": 1.4853157997131348, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8925912380218506, + "num_tokens": 762183264.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "grad_norm": 1.4718363285064697, + "learning_rate": 1e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9077773690223694, + "num_tokens": 762222405.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "grad_norm": 1.5007259845733643, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8826228380203247, + "num_tokens": 762262021.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "grad_norm": 1.430782675743103, + "learning_rate": 1e-06, + "loss": 0.249, + "mean_token_accuracy": 0.9076446294784546, + "num_tokens": 762297119.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "grad_norm": 1.5106439590454102, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8756335377693176, + "num_tokens": 762338259.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "grad_norm": 1.6188842058181763, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8857482671737671, + "num_tokens": 762377355.0, + "step": 19985 + }, + { + "epoch": 2.542424627909935, + "grad_norm": 1.5484378337860107, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.893247127532959, + "num_tokens": 762412590.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "grad_norm": 1.5885316133499146, + 
"learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8839297890663147, + "num_tokens": 762452357.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "grad_norm": 1.5726168155670166, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8976975679397583, + "num_tokens": 762486683.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "grad_norm": 1.6748772859573364, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8710629940032959, + "num_tokens": 762525990.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "grad_norm": 1.5312561988830566, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.879122257232666, + "num_tokens": 762568638.0, + "step": 19990 + }, + { + "epoch": 2.543060679302888, + "grad_norm": 1.6129934787750244, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8957870006561279, + "num_tokens": 762602503.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "grad_norm": 1.4911447763442993, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8865115642547607, + "num_tokens": 762645895.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "grad_norm": 1.6932083368301392, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8897115588188171, + "num_tokens": 762682324.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "grad_norm": 1.6578795909881592, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8756157755851746, + "num_tokens": 762720746.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "grad_norm": 1.7560032606124878, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8729197978973389, + "num_tokens": 762756689.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "grad_norm": 1.771744966506958, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8783335089683533, + "num_tokens": 762788308.0, + 
"step": 19996 + }, + { + "epoch": 2.5438239409744305, + "grad_norm": 1.6998894214630127, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8834613561630249, + "num_tokens": 762822519.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "grad_norm": 1.4795300960540771, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8963552713394165, + "num_tokens": 762863048.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "grad_norm": 1.5244367122650146, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8800449371337891, + "num_tokens": 762900599.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "grad_norm": 1.6446051597595215, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8803128004074097, + "num_tokens": 762939328.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "grad_norm": 1.4128795862197876, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8953968286514282, + "num_tokens": 762983680.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "grad_norm": 1.5409194231033325, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8943358063697815, + "num_tokens": 763021736.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "grad_norm": 1.5805060863494873, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8849239945411682, + "num_tokens": 763058555.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "grad_norm": 1.626409888267517, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8908487558364868, + "num_tokens": 763090622.0, + "step": 20004 + }, + { + "epoch": 2.5448416232031548, + "grad_norm": 1.485676884651184, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8848389387130737, + "num_tokens": 763131046.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "grad_norm": 1.6680033206939697, + "learning_rate": 1e-06, + 
"loss": 0.2922, + "mean_token_accuracy": 0.8935219049453735, + "num_tokens": 763162926.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "grad_norm": 1.5310355424880981, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8953045010566711, + "num_tokens": 763198326.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "grad_norm": 1.5151335000991821, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.881125807762146, + "num_tokens": 763237144.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "grad_norm": 1.5887291431427002, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8918159008026123, + "num_tokens": 763272703.0, + "step": 20009 + }, + { + "epoch": 2.5454776745961074, + "grad_norm": 1.5660545825958252, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8772569894790649, + "num_tokens": 763313000.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "grad_norm": 1.641222357749939, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8658416271209717, + "num_tokens": 763351951.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "grad_norm": 1.4827888011932373, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.893447756767273, + "num_tokens": 763389604.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "grad_norm": 1.5156147480010986, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8899756073951721, + "num_tokens": 763425996.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "grad_norm": 1.5405993461608887, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8882925510406494, + "num_tokens": 763463191.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "grad_norm": 1.575930118560791, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8818603754043579, + "num_tokens": 763504066.0, + "step": 20015 + }, + { + "epoch": 
2.5462409362676506, + "grad_norm": 1.779083251953125, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8693158626556396, + "num_tokens": 763543761.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "grad_norm": 1.553385615348816, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8826054334640503, + "num_tokens": 763583937.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "grad_norm": 1.590320348739624, + "learning_rate": 1e-06, + "loss": 0.276, + "mean_token_accuracy": 0.8996511697769165, + "num_tokens": 763619212.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "grad_norm": 1.5369312763214111, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8865306377410889, + "num_tokens": 763658975.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "grad_norm": 1.627397060394287, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8804207444190979, + "num_tokens": 763697080.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "grad_norm": 1.4243111610412598, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8899859189987183, + "num_tokens": 763738077.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "grad_norm": 1.6360067129135132, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8871605396270752, + "num_tokens": 763773134.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "grad_norm": 1.5192639827728271, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8895719647407532, + "num_tokens": 763810882.0, + "step": 20023 + }, + { + "epoch": 2.5472586184963744, + "grad_norm": 1.7641059160232544, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8827660083770752, + "num_tokens": 763847556.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "grad_norm": 1.616653323173523, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 
0.8940632343292236, + "num_tokens": 763883301.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "grad_norm": 1.7064917087554932, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8753747940063477, + "num_tokens": 763919638.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "grad_norm": 1.663225769996643, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8670229911804199, + "num_tokens": 763958439.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "grad_norm": 1.5620077848434448, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8844485282897949, + "num_tokens": 763998371.0, + "step": 20028 + }, + { + "epoch": 2.547894669889327, + "grad_norm": 1.6738978624343872, + "learning_rate": 1e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.9008330702781677, + "num_tokens": 764027726.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "grad_norm": 1.5443741083145142, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8965110182762146, + "num_tokens": 764064911.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "grad_norm": 1.4959498643875122, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8810789585113525, + "num_tokens": 764110126.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "grad_norm": 1.6236813068389893, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8747694492340088, + "num_tokens": 764148300.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "grad_norm": 1.4366246461868286, + "learning_rate": 1e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.8994688987731934, + "num_tokens": 764187087.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "grad_norm": 1.4770941734313965, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8939570188522339, + "num_tokens": 764228265.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "grad_norm": 
1.6170212030410767, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8858120441436768, + "num_tokens": 764263385.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "grad_norm": 1.5105013847351074, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.888807475566864, + "num_tokens": 764303385.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "grad_norm": 1.730432152748108, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8783676624298096, + "num_tokens": 764337669.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "grad_norm": 1.5546306371688843, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8733831644058228, + "num_tokens": 764378248.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "grad_norm": 1.6456799507141113, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.877705454826355, + "num_tokens": 764416884.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "grad_norm": 1.585292935371399, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8805770874023438, + "num_tokens": 764457715.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "grad_norm": 1.4274545907974243, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8849785923957825, + "num_tokens": 764504263.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "grad_norm": 1.6096537113189697, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8788456916809082, + "num_tokens": 764543392.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "grad_norm": 1.608780026435852, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8881773948669434, + "num_tokens": 764581284.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "grad_norm": 1.7960797548294067, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8672266602516174, + "num_tokens": 
764614152.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "grad_norm": 1.4216142892837524, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8821218013763428, + "num_tokens": 764660961.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "grad_norm": 1.5389951467514038, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8910192251205444, + "num_tokens": 764699429.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "grad_norm": 1.5403140783309937, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8977224826812744, + "num_tokens": 764734087.0, + "step": 20047 + }, + { + "epoch": 2.5503116651825466, + "grad_norm": 1.8440033197402954, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8754026889801025, + "num_tokens": 764765975.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "grad_norm": 1.3451721668243408, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8948314189910889, + "num_tokens": 764809259.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "grad_norm": 1.4630377292633057, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8776353001594543, + "num_tokens": 764852999.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "grad_norm": 1.58284592628479, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8790642619132996, + "num_tokens": 764892276.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "grad_norm": 1.5780112743377686, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8842583894729614, + "num_tokens": 764935386.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "grad_norm": 1.618552565574646, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8901029825210571, + "num_tokens": 764969223.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "grad_norm": 1.3631607294082642, + "learning_rate": 
1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8920104503631592, + "num_tokens": 765014734.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "grad_norm": 1.5335904359817505, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8866418600082397, + "num_tokens": 765053950.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "grad_norm": 1.5596152544021606, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8905388712882996, + "num_tokens": 765092907.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "grad_norm": 1.5674030780792236, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8663747310638428, + "num_tokens": 765136343.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "grad_norm": 1.5840009450912476, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.9004608392715454, + "num_tokens": 765172283.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "grad_norm": 1.4955419301986694, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8958573937416077, + "num_tokens": 765212191.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "grad_norm": 1.590201735496521, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8726820945739746, + "num_tokens": 765250938.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "grad_norm": 1.6954265832901, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8781158328056335, + "num_tokens": 765287370.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "grad_norm": 1.5456713438034058, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.876327395439148, + "num_tokens": 765329689.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "grad_norm": 1.5989593267440796, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8883883357048035, + "num_tokens": 765365054.0, + "step": 20063 + }, + { + 
"epoch": 2.552347029639995, + "grad_norm": 1.6921952962875366, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8904759883880615, + "num_tokens": 765394991.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "grad_norm": 1.4975364208221436, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8868209719657898, + "num_tokens": 765433854.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "grad_norm": 1.6078929901123047, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8851644396781921, + "num_tokens": 765469572.0, + "step": 20066 + }, + { + "epoch": 2.5527286604757666, + "grad_norm": 1.5992741584777832, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8800543546676636, + "num_tokens": 765508587.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "grad_norm": 1.6393221616744995, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8826251029968262, + "num_tokens": 765548313.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "grad_norm": 1.4041985273361206, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8981668949127197, + "num_tokens": 765591390.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "grad_norm": 1.510006070137024, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.887903094291687, + "num_tokens": 765633012.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "grad_norm": 1.4531432390213013, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8861512541770935, + "num_tokens": 765675174.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "grad_norm": 1.7068010568618774, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8841567039489746, + "num_tokens": 765706745.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "grad_norm": 1.6639710664749146, + "learning_rate": 1e-06, + "loss": 0.3786, + 
"mean_token_accuracy": 0.8689923882484436, + "num_tokens": 765741968.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "grad_norm": 1.5973998308181763, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8883627653121948, + "num_tokens": 765779511.0, + "step": 20074 + }, + { + "epoch": 2.5537463427044904, + "grad_norm": 1.6045252084732056, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8753691911697388, + "num_tokens": 765817936.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "grad_norm": 1.4421967267990112, + "learning_rate": 1e-06, + "loss": 0.2705, + "mean_token_accuracy": 0.9015979170799255, + "num_tokens": 765855576.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "grad_norm": 2.932003974914551, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8726341724395752, + "num_tokens": 765893714.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "grad_norm": 1.5108264684677124, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8952102065086365, + "num_tokens": 765932334.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "grad_norm": 1.6183003187179565, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.879228413105011, + "num_tokens": 765968692.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "grad_norm": 1.3771694898605347, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8945571780204773, + "num_tokens": 766013308.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "grad_norm": 1.5362187623977661, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8910759687423706, + "num_tokens": 766051037.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "grad_norm": 1.4279253482818604, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8877348303794861, + "num_tokens": 766098821.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + 
"grad_norm": 1.546924352645874, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.87715744972229, + "num_tokens": 766142032.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "grad_norm": 1.4995896816253662, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8828542232513428, + "num_tokens": 766182627.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "grad_norm": 1.6875795125961304, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8830801248550415, + "num_tokens": 766220831.0, + "step": 20085 + }, + { + "epoch": 2.5551456557689862, + "grad_norm": 1.6655664443969727, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.891217052936554, + "num_tokens": 766254520.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "grad_norm": 1.5306462049484253, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8918533325195312, + "num_tokens": 766292668.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "grad_norm": 1.5176746845245361, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8854686617851257, + "num_tokens": 766332601.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "grad_norm": 1.554457187652588, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8807573318481445, + "num_tokens": 766370909.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "grad_norm": 1.4943807125091553, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8891449570655823, + "num_tokens": 766409025.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "grad_norm": 1.665788173675537, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8934162259101868, + "num_tokens": 766445664.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "grad_norm": 1.5489286184310913, + "learning_rate": 1e-06, + "loss": 0.2705, + "mean_token_accuracy": 0.9022689461708069, + 
"num_tokens": 766482612.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "grad_norm": 1.5612014532089233, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8902175426483154, + "num_tokens": 766519500.0, + "step": 20093 + }, + { + "epoch": 2.5561633379977104, + "grad_norm": 1.6913199424743652, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8889983296394348, + "num_tokens": 766551056.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "grad_norm": 1.4830248355865479, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8913716673851013, + "num_tokens": 766591017.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "grad_norm": 1.5928183794021606, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.889436662197113, + "num_tokens": 766629093.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "grad_norm": 1.5248429775238037, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8804791569709778, + "num_tokens": 766669022.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "grad_norm": 1.4543793201446533, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8866133689880371, + "num_tokens": 766709269.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "grad_norm": 1.4735803604125977, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8953630328178406, + "num_tokens": 766748081.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "grad_norm": 1.6817660331726074, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8752238750457764, + "num_tokens": 766782285.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "grad_norm": 1.491062045097351, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8942103981971741, + "num_tokens": 766820389.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "grad_norm": 1.6288912296295166, + 
"learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8862334489822388, + "num_tokens": 766857391.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "grad_norm": 1.4334526062011719, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8805335760116577, + "num_tokens": 766902301.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "grad_norm": 1.488621711730957, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8801040053367615, + "num_tokens": 766945729.0, + "step": 20104 + }, + { + "epoch": 2.557562651062206, + "grad_norm": 1.711487054824829, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8796733021736145, + "num_tokens": 766980761.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "grad_norm": 1.4979342222213745, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8912395238876343, + "num_tokens": 767018398.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "grad_norm": 1.3616135120391846, + "learning_rate": 1e-06, + "loss": 0.2636, + "mean_token_accuracy": 0.9033340215682983, + "num_tokens": 767062614.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "grad_norm": 1.5277591943740845, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8836426138877869, + "num_tokens": 767102404.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "grad_norm": 1.50124192237854, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8877966403961182, + "num_tokens": 767143259.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "grad_norm": 1.5038400888442993, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8888506889343262, + "num_tokens": 767180179.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "grad_norm": 1.5886434316635132, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8850226998329163, + "num_tokens": 767216944.0, + "step": 
20111 + }, + { + "epoch": 2.5584531230123395, + "grad_norm": 1.4994367361068726, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8900911808013916, + "num_tokens": 767256226.0, + "step": 20112 + }, + { + "epoch": 2.55858033329093, + "grad_norm": 1.519818902015686, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8804171085357666, + "num_tokens": 767299386.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "grad_norm": 1.4224892854690552, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8929787278175354, + "num_tokens": 767342539.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "grad_norm": 1.525549054145813, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8873944878578186, + "num_tokens": 767381148.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "grad_norm": 1.6070929765701294, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8940617442131042, + "num_tokens": 767416537.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "grad_norm": 1.5891869068145752, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8785240650177002, + "num_tokens": 767453037.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "grad_norm": 1.6687220335006714, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9019876718521118, + "num_tokens": 767482621.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "grad_norm": 1.6040834188461304, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8868721723556519, + "num_tokens": 767521365.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "grad_norm": 1.6737498044967651, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8795793056488037, + "num_tokens": 767557475.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "grad_norm": 1.5151035785675049, + "learning_rate": 1e-06, + "loss": 0.3088, + 
"mean_token_accuracy": 0.888932466506958, + "num_tokens": 767598039.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "grad_norm": 1.5980439186096191, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8896251320838928, + "num_tokens": 767635414.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "grad_norm": 1.6578304767608643, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8668180704116821, + "num_tokens": 767673792.0, + "step": 20123 + }, + { + "epoch": 2.5599796463554254, + "grad_norm": 1.5040515661239624, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8885094523429871, + "num_tokens": 767713909.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "grad_norm": 1.7048920392990112, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8910887241363525, + "num_tokens": 767748447.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "grad_norm": 1.654855728149414, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8813978433609009, + "num_tokens": 767787424.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "grad_norm": 1.687624454498291, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8960349559783936, + "num_tokens": 767817688.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "grad_norm": 1.5524516105651855, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8775990605354309, + "num_tokens": 767865298.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "grad_norm": 1.6502071619033813, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8831945657730103, + "num_tokens": 767900838.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "grad_norm": 1.6935899257659912, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8834548592567444, + "num_tokens": 767933341.0, + "step": 20130 + }, + { + "epoch": 
2.560870118305559, + "grad_norm": 1.6614189147949219, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8901643753051758, + "num_tokens": 767969184.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "grad_norm": 1.6039104461669922, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8902125358581543, + "num_tokens": 768006976.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "grad_norm": 1.6874940395355225, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8752246499061584, + "num_tokens": 768045992.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "grad_norm": 1.5466148853302002, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8997830152511597, + "num_tokens": 768080184.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "grad_norm": 1.6314117908477783, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8904811143875122, + "num_tokens": 768114190.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "grad_norm": 1.5117528438568115, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8852365016937256, + "num_tokens": 768155033.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "grad_norm": 1.7888481616973877, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8726819753646851, + "num_tokens": 768187610.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "grad_norm": 1.4698928594589233, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8891427516937256, + "num_tokens": 768226898.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "grad_norm": 1.5886682271957397, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8737754225730896, + "num_tokens": 768263990.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "grad_norm": 1.7700494527816772, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 
0.861260712146759, + "num_tokens": 768297201.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "grad_norm": 1.3674627542495728, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8908805847167969, + "num_tokens": 768341508.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "grad_norm": 1.6577589511871338, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8876606822013855, + "num_tokens": 768373580.0, + "step": 20142 + }, + { + "epoch": 2.562396641648645, + "grad_norm": 1.5036437511444092, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8907867670059204, + "num_tokens": 768414567.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "grad_norm": 1.5307180881500244, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8840872645378113, + "num_tokens": 768454871.0, + "step": 20144 + }, + { + "epoch": 2.562651062205826, + "grad_norm": 1.6208205223083496, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8778009414672852, + "num_tokens": 768492539.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "grad_norm": 1.6198111772537231, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8768975734710693, + "num_tokens": 768529270.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "grad_norm": 1.5553914308547974, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8748513460159302, + "num_tokens": 768572520.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "grad_norm": 1.5902568101882935, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8864854574203491, + "num_tokens": 768612366.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "grad_norm": 1.500137209892273, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8824286460876465, + "num_tokens": 768653300.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "grad_norm": 
1.6168251037597656, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.873685359954834, + "num_tokens": 768692244.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "grad_norm": 1.4470940828323364, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8814975619316101, + "num_tokens": 768735126.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "grad_norm": 1.5152136087417603, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8922897577285767, + "num_tokens": 768771230.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "grad_norm": 1.4997117519378662, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8973857164382935, + "num_tokens": 768810324.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "grad_norm": 1.5639647245407104, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8910894393920898, + "num_tokens": 768845772.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "grad_norm": 1.6001704931259155, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8801960349082947, + "num_tokens": 768882667.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "grad_norm": 1.5808038711547852, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8896706700325012, + "num_tokens": 768915887.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "grad_norm": 1.5233973264694214, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8912568092346191, + "num_tokens": 768954614.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "grad_norm": 1.655758023262024, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.89677494764328, + "num_tokens": 768989679.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "grad_norm": 1.693888545036316, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8733083009719849, + "num_tokens": 
769025335.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "grad_norm": 1.4522995948791504, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8896295428276062, + "num_tokens": 769063962.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "grad_norm": 1.4720908403396606, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8818638324737549, + "num_tokens": 769105204.0, + "step": 20161 + }, + { + "epoch": 2.564813636941865, + "grad_norm": 1.6677320003509521, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8799000978469849, + "num_tokens": 769140697.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "grad_norm": 1.5162701606750488, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8787057399749756, + "num_tokens": 769181743.0, + "step": 20163 + }, + { + "epoch": 2.5650680574990457, + "grad_norm": 1.6924264430999756, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8810863494873047, + "num_tokens": 769214954.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "grad_norm": 1.542542815208435, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8894726037979126, + "num_tokens": 769250169.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "grad_norm": 1.5199222564697266, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8913125395774841, + "num_tokens": 769287451.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "grad_norm": 1.5407596826553345, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8737783432006836, + "num_tokens": 769325227.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "grad_norm": 1.4515661001205444, + "learning_rate": 1e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.8959769010543823, + "num_tokens": 769367145.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "grad_norm": 1.4829312562942505, + 
"learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8911935687065125, + "num_tokens": 769403956.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "grad_norm": 1.6873332262039185, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8870261907577515, + "num_tokens": 769442604.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "grad_norm": 1.7085126638412476, + "learning_rate": 1e-06, + "loss": 0.2724, + "mean_token_accuracy": 0.8991456031799316, + "num_tokens": 769474717.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "grad_norm": 1.56045401096344, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8927468061447144, + "num_tokens": 769507880.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "grad_norm": 1.5045632123947144, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8839410543441772, + "num_tokens": 769547292.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "grad_norm": 1.6430728435516357, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8821636438369751, + "num_tokens": 769585500.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "grad_norm": 1.5839377641677856, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8942798972129822, + "num_tokens": 769620016.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "grad_norm": 1.5554475784301758, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8922069072723389, + "num_tokens": 769659628.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "grad_norm": 1.463200330734253, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8889042139053345, + "num_tokens": 769699779.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "grad_norm": 1.5625793933868408, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.889066219329834, + "num_tokens": 769736301.0, + "step": 
20178 + }, + { + "epoch": 2.5669762116779036, + "grad_norm": 1.4506572484970093, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8745067119598389, + "num_tokens": 769781273.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "grad_norm": 1.6665725708007812, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8777226209640503, + "num_tokens": 769815051.0, + "step": 20180 + }, + { + "epoch": 2.5672306322350846, + "grad_norm": 1.5899242162704468, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8851471543312073, + "num_tokens": 769848917.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "grad_norm": 1.5855616331100464, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8817214369773865, + "num_tokens": 769886851.0, + "step": 20182 + }, + { + "epoch": 2.5674850527922657, + "grad_norm": 1.399520993232727, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8924184441566467, + "num_tokens": 769931457.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "grad_norm": 1.4926033020019531, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8874155879020691, + "num_tokens": 769969445.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "grad_norm": 1.524194359779358, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8855928182601929, + "num_tokens": 770012673.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "grad_norm": 1.616608738899231, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8829158544540405, + "num_tokens": 770053691.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "grad_norm": 1.531450629234314, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8881173133850098, + "num_tokens": 770091461.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "grad_norm": 1.552870750427246, + "learning_rate": 1e-06, + "loss": 0.2906, + 
"mean_token_accuracy": 0.8968501091003418, + "num_tokens": 770131864.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "grad_norm": 1.659281849861145, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8818644285202026, + "num_tokens": 770168253.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "grad_norm": 1.544638991355896, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8970428705215454, + "num_tokens": 770203174.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "grad_norm": 1.3827219009399414, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8873629570007324, + "num_tokens": 770248108.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "grad_norm": 1.742319941520691, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8817184567451477, + "num_tokens": 770282823.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "grad_norm": 1.6555185317993164, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8892402648925781, + "num_tokens": 770316655.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "grad_norm": 1.628267765045166, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8833582401275635, + "num_tokens": 770352797.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "grad_norm": 1.6635369062423706, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8840380907058716, + "num_tokens": 770385578.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "grad_norm": 1.5289441347122192, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.897006094455719, + "num_tokens": 770422318.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "grad_norm": 1.5960416793823242, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8869037628173828, + "num_tokens": 770458176.0, + "step": 20197 + }, + { + "epoch": 
2.569393206971123, + "grad_norm": 1.662376880645752, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8984465599060059, + "num_tokens": 770489982.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "grad_norm": 1.4503159523010254, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8929229974746704, + "num_tokens": 770529113.0, + "step": 20199 + }, + { + "epoch": 2.5696476275283042, + "grad_norm": 1.5089695453643799, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8921190500259399, + "num_tokens": 770568458.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "grad_norm": 1.662702202796936, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8816080689430237, + "num_tokens": 770608195.0, + "step": 20201 + }, + { + "epoch": 2.5699020480854853, + "grad_norm": 1.5089408159255981, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8859996795654297, + "num_tokens": 770644513.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "grad_norm": 1.6911433935165405, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.88611900806427, + "num_tokens": 770680105.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "grad_norm": 1.6268751621246338, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8897417187690735, + "num_tokens": 770717752.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "grad_norm": 1.5992634296417236, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8953291773796082, + "num_tokens": 770756999.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "grad_norm": 1.4361841678619385, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8900102376937866, + "num_tokens": 770802534.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "grad_norm": 1.7177042961120605, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 
0.8713839650154114, + "num_tokens": 770838280.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "grad_norm": 1.7375459671020508, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8791242837905884, + "num_tokens": 770871516.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "grad_norm": 1.5871261358261108, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8883318901062012, + "num_tokens": 770907799.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "grad_norm": 1.6106115579605103, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8825322389602661, + "num_tokens": 770946514.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "grad_norm": 1.505354404449463, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8908291459083557, + "num_tokens": 770984568.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "grad_norm": 1.686217188835144, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8821698427200317, + "num_tokens": 771017917.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "grad_norm": 1.3559730052947998, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8927320837974548, + "num_tokens": 771061388.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 1.522560954093933, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8794568777084351, + "num_tokens": 771103146.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "grad_norm": 1.6027705669403076, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8873016834259033, + "num_tokens": 771138815.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "grad_norm": 1.5152348279953003, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8833559155464172, + "num_tokens": 771179752.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "grad_norm": 
1.7387871742248535, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8772408366203308, + "num_tokens": 771212877.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "grad_norm": 1.5497466325759888, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8689570426940918, + "num_tokens": 771256429.0, + "step": 20218 + }, + { + "epoch": 2.572064622821524, + "grad_norm": 1.4307552576065063, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8858193159103394, + "num_tokens": 771300717.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "grad_norm": 1.5683718919754028, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8770295977592468, + "num_tokens": 771336901.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "grad_norm": 1.5445998907089233, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8869409561157227, + "num_tokens": 771376322.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "grad_norm": 1.6842906475067139, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8800201416015625, + "num_tokens": 771415188.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "grad_norm": 1.6390539407730103, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8786987066268921, + "num_tokens": 771455730.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "grad_norm": 1.5034130811691284, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8924132585525513, + "num_tokens": 771494189.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "grad_norm": 1.4302093982696533, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8872342705726624, + "num_tokens": 771537306.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "grad_norm": 1.596301555633545, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8962624073028564, + "num_tokens": 
771571099.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "grad_norm": 1.583203673362732, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8791469931602478, + "num_tokens": 771611565.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "grad_norm": 1.523219347000122, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8729335069656372, + "num_tokens": 771655753.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "grad_norm": 1.6148076057434082, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8799023628234863, + "num_tokens": 771699034.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "grad_norm": 1.6466999053955078, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8695313334465027, + "num_tokens": 771739292.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "grad_norm": 1.507124423980713, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8915163278579712, + "num_tokens": 771778911.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "grad_norm": 1.440071940422058, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8952857255935669, + "num_tokens": 771816359.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "grad_norm": 1.4767096042633057, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8833012580871582, + "num_tokens": 771857477.0, + "step": 20233 + }, + { + "epoch": 2.5739727770003817, + "grad_norm": 1.4372199773788452, + "learning_rate": 1e-06, + "loss": 0.2599, + "mean_token_accuracy": 0.9039263725280762, + "num_tokens": 771894162.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "grad_norm": 1.5774484872817993, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8775224685668945, + "num_tokens": 771932371.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "grad_norm": 1.4992115497589111, + "learning_rate": 
1e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.8994031548500061, + "num_tokens": 771970483.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "grad_norm": 1.6192748546600342, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8956171274185181, + "num_tokens": 772004184.0, + "step": 20237 + }, + { + "epoch": 2.574481618114744, + "grad_norm": 1.6196622848510742, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8786796927452087, + "num_tokens": 772041515.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "grad_norm": 1.7166426181793213, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.873712420463562, + "num_tokens": 772079975.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "grad_norm": 1.541861891746521, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8873146772384644, + "num_tokens": 772120135.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "grad_norm": 1.5250866413116455, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.887507438659668, + "num_tokens": 772160746.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "grad_norm": 1.5018233060836792, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8916342258453369, + "num_tokens": 772200100.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "grad_norm": 1.5702807903289795, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8856006860733032, + "num_tokens": 772238390.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "grad_norm": 1.4385066032409668, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8816189765930176, + "num_tokens": 772282955.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "grad_norm": 1.4525667428970337, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8972282409667969, + "num_tokens": 772323760.0, + "step": 20245 + }, + { + 
"epoch": 2.5754993003434676, + "grad_norm": 1.5818238258361816, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.883635401725769, + "num_tokens": 772363248.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "grad_norm": 1.5686112642288208, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8952385783195496, + "num_tokens": 772401481.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "grad_norm": 1.5549390316009521, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8852367401123047, + "num_tokens": 772439742.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "grad_norm": 1.663676142692566, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8845280408859253, + "num_tokens": 772473531.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "grad_norm": 1.5406193733215332, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8846028447151184, + "num_tokens": 772512480.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "grad_norm": 1.5366156101226807, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8896439671516418, + "num_tokens": 772551513.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "grad_norm": 1.4885354042053223, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8934829235076904, + "num_tokens": 772590934.0, + "step": 20252 + }, + { + "epoch": 2.5763897722936013, + "grad_norm": 1.6112664937973022, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8903768658638, + "num_tokens": 772626037.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "grad_norm": 1.5053839683532715, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8932772874832153, + "num_tokens": 772666940.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "grad_norm": 1.7054120302200317, + "learning_rate": 1e-06, + "loss": 0.2938, + 
"mean_token_accuracy": 0.8939083814620972, + "num_tokens": 772698794.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "grad_norm": 1.404935598373413, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8923854231834412, + "num_tokens": 772741915.0, + "step": 20256 + }, + { + "epoch": 2.5768986134079634, + "grad_norm": 1.6699174642562866, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8858113884925842, + "num_tokens": 772774657.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "grad_norm": 1.5519170761108398, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8751294612884521, + "num_tokens": 772814130.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "grad_norm": 1.7391103506088257, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8722122311592102, + "num_tokens": 772856788.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "grad_norm": 1.5648722648620605, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8895848989486694, + "num_tokens": 772893699.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "grad_norm": 1.714174747467041, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8682243824005127, + "num_tokens": 772929970.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "grad_norm": 1.5828702449798584, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.888002336025238, + "num_tokens": 772968264.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "grad_norm": 1.7148925065994263, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8746693730354309, + "num_tokens": 773001211.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "grad_norm": 1.671161413192749, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8769590854644775, + "num_tokens": 773038124.0, + "step": 20264 + }, + { + "epoch": 
2.5779162956366877, + "grad_norm": 1.6874725818634033, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8815090656280518, + "num_tokens": 773072324.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "grad_norm": 1.6531336307525635, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8772439360618591, + "num_tokens": 773105415.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "grad_norm": 1.509584903717041, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.876548171043396, + "num_tokens": 773148202.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "grad_norm": 1.6241490840911865, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8818092942237854, + "num_tokens": 773183496.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "grad_norm": 1.568660020828247, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8759579658508301, + "num_tokens": 773224559.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "grad_norm": 1.5822614431381226, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8844670057296753, + "num_tokens": 773263092.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "grad_norm": 1.464869499206543, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8756494522094727, + "num_tokens": 773306865.0, + "step": 20271 + }, + { + "epoch": 2.578806767586821, + "grad_norm": 1.4566075801849365, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8785293698310852, + "num_tokens": 773351703.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "grad_norm": 1.6025986671447754, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8868153095245361, + "num_tokens": 773390047.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "grad_norm": 1.384260892868042, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 
0.8863037824630737, + "num_tokens": 773432722.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "grad_norm": 1.4942364692687988, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8824412226676941, + "num_tokens": 773472611.0, + "step": 20275 + }, + { + "epoch": 2.579315608701183, + "grad_norm": 1.6170192956924438, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8815551996231079, + "num_tokens": 773511884.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "grad_norm": 1.6793452501296997, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.876979649066925, + "num_tokens": 773547318.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "grad_norm": 1.6179709434509277, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8786658048629761, + "num_tokens": 773584052.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "grad_norm": 1.6505409479141235, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8816741108894348, + "num_tokens": 773617936.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "grad_norm": 1.5217671394348145, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8742601871490479, + "num_tokens": 773661214.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "grad_norm": 1.4312748908996582, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8903990387916565, + "num_tokens": 773702390.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "grad_norm": 1.5420113801956177, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8903542160987854, + "num_tokens": 773738417.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "grad_norm": 1.5183497667312622, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.9009437561035156, + "num_tokens": 773774579.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "grad_norm": 
1.6535769701004028, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8882752656936646, + "num_tokens": 773809961.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "grad_norm": 1.5520660877227783, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.877888560295105, + "num_tokens": 773852714.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "grad_norm": 1.5729594230651855, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8907727003097534, + "num_tokens": 773887201.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "grad_norm": 1.699625015258789, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8929773569107056, + "num_tokens": 773917707.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "grad_norm": 1.5995137691497803, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8789095282554626, + "num_tokens": 773954273.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "grad_norm": 1.4721423387527466, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8946014642715454, + "num_tokens": 773997916.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "grad_norm": 1.597966194152832, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8827289342880249, + "num_tokens": 774035128.0, + "step": 20290 + }, + { + "epoch": 2.5812237628800405, + "grad_norm": 1.6359518766403198, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8702081441879272, + "num_tokens": 774071951.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "grad_norm": 1.6765316724777222, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8835110068321228, + "num_tokens": 774102947.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "grad_norm": 1.5150219202041626, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8933974504470825, + 
"num_tokens": 774141929.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "grad_norm": 1.6151880025863647, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.88033127784729, + "num_tokens": 774181342.0, + "step": 20294 + }, + { + "epoch": 2.5817326039944026, + "grad_norm": 1.5992584228515625, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8825167417526245, + "num_tokens": 774217342.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "grad_norm": 1.5515998601913452, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.8973478078842163, + "num_tokens": 774254683.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "grad_norm": 1.4393773078918457, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8800273537635803, + "num_tokens": 774302116.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "grad_norm": 1.6521598100662231, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8888319730758667, + "num_tokens": 774339321.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "grad_norm": 1.5780742168426514, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8972963690757751, + "num_tokens": 774374677.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "grad_norm": 1.602621078491211, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8738580346107483, + "num_tokens": 774417729.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "grad_norm": 1.7530258893966675, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8878503441810608, + "num_tokens": 774449375.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "grad_norm": 1.6259547472000122, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8925856351852417, + "num_tokens": 774481533.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "grad_norm": 1.5455540418624878, + 
"learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8884443044662476, + "num_tokens": 774520526.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "grad_norm": 1.4839211702346802, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8842452764511108, + "num_tokens": 774559594.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "grad_norm": 1.7368419170379639, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8975381255149841, + "num_tokens": 774591278.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "grad_norm": 1.434127688407898, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.877288818359375, + "num_tokens": 774639477.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "grad_norm": 1.6808065176010132, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8836788535118103, + "num_tokens": 774673235.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "grad_norm": 1.553568720817566, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8759857416152954, + "num_tokens": 774713538.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "grad_norm": 1.5734227895736694, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.88640958070755, + "num_tokens": 774750603.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, + "grad_norm": 1.4992449283599854, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8942003846168518, + "num_tokens": 774791812.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "grad_norm": 1.4896278381347656, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8885817527770996, + "num_tokens": 774833452.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "grad_norm": 1.4988372325897217, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8894220590591431, + "num_tokens": 774875126.0, + "step": 20312 
+ }, + { + "epoch": 2.584022389009032, + "grad_norm": 1.5819182395935059, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8850282430648804, + "num_tokens": 774913240.0, + "step": 20313 + }, + { + "epoch": 2.5841495992876222, + "grad_norm": 1.5630372762680054, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9017511606216431, + "num_tokens": 774946404.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "grad_norm": 1.4978208541870117, + "learning_rate": 1e-06, + "loss": 0.2682, + "mean_token_accuracy": 0.9020561575889587, + "num_tokens": 774983613.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "grad_norm": 1.6160557270050049, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8951822519302368, + "num_tokens": 775019242.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "grad_norm": 1.6542829275131226, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8920069336891174, + "num_tokens": 775050610.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "grad_norm": 1.564674735069275, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8823713660240173, + "num_tokens": 775087880.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "grad_norm": 1.5835281610488892, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8739614486694336, + "num_tokens": 775126912.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "grad_norm": 1.5920698642730713, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8922038674354553, + "num_tokens": 775160157.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "grad_norm": 1.4952231645584106, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8937190771102905, + "num_tokens": 775198693.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "grad_norm": 1.5848246812820435, + "learning_rate": 1e-06, + "loss": 0.2921, + 
"mean_token_accuracy": 0.8945976495742798, + "num_tokens": 775234717.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "grad_norm": 1.5070202350616455, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8963890075683594, + "num_tokens": 775271455.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "grad_norm": 1.6555380821228027, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8822231888771057, + "num_tokens": 775308476.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "grad_norm": 1.6403173208236694, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8874850273132324, + "num_tokens": 775346618.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "grad_norm": 1.3673477172851562, + "learning_rate": 1e-06, + "loss": 0.2624, + "mean_token_accuracy": 0.9058427214622498, + "num_tokens": 775386523.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "grad_norm": 1.5488024950027466, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8864725828170776, + "num_tokens": 775425724.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "grad_norm": 1.713902473449707, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8850657343864441, + "num_tokens": 775462811.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "grad_norm": 1.4807422161102295, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8892760276794434, + "num_tokens": 775504966.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "grad_norm": 1.6258141994476318, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8764748573303223, + "num_tokens": 775539709.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "grad_norm": 1.725722312927246, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8960670232772827, + "num_tokens": 775572710.0, + "step": 20331 + }, + { + "epoch": 
2.5864393843022517, + "grad_norm": 1.6528786420822144, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8944374918937683, + "num_tokens": 775607663.0, + "step": 20332 + }, + { + "epoch": 2.5865665945808423, + "grad_norm": 1.5906957387924194, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.884033739566803, + "num_tokens": 775644656.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "grad_norm": 1.549688696861267, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8802622556686401, + "num_tokens": 775684173.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "grad_norm": 1.4112648963928223, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.896609902381897, + "num_tokens": 775727151.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "grad_norm": 1.648457407951355, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8818150758743286, + "num_tokens": 775759485.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "grad_norm": 1.5389126539230347, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.882906436920166, + "num_tokens": 775799013.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "grad_norm": 1.6392980813980103, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8996658325195312, + "num_tokens": 775833132.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "grad_norm": 1.5692603588104248, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8785295486450195, + "num_tokens": 775870627.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "grad_norm": 1.534033179283142, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8938419222831726, + "num_tokens": 775905684.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "grad_norm": 1.5162736177444458, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 
0.8776271343231201, + "num_tokens": 775946044.0, + "step": 20341 + }, + { + "epoch": 2.5877114870881566, + "grad_norm": 1.4664063453674316, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8917703628540039, + "num_tokens": 775984862.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "grad_norm": 1.5632036924362183, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8938077688217163, + "num_tokens": 776019147.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "grad_norm": 1.5606275796890259, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8886786103248596, + "num_tokens": 776058872.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "grad_norm": 1.6369655132293701, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8890962600708008, + "num_tokens": 776097453.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "grad_norm": 1.8675272464752197, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8847582340240479, + "num_tokens": 776134494.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "grad_norm": 1.5599116086959839, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8802810907363892, + "num_tokens": 776172987.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "grad_norm": 1.5477979183197021, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8903871178627014, + "num_tokens": 776208613.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "grad_norm": 1.6792922019958496, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8750762939453125, + "num_tokens": 776245709.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "grad_norm": 1.6341077089309692, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8969193696975708, + "num_tokens": 776281043.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "grad_norm": 
1.5317014455795288, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8842031955718994, + "num_tokens": 776323232.0, + "step": 20351 + }, + { + "epoch": 2.588983589874062, + "grad_norm": 1.5011931657791138, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8974278569221497, + "num_tokens": 776363885.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "grad_norm": 1.6286382675170898, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8818720579147339, + "num_tokens": 776401406.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "grad_norm": 1.4494438171386719, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8901329636573792, + "num_tokens": 776443330.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "grad_norm": 1.65609872341156, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8825885653495789, + "num_tokens": 776480290.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "grad_norm": 1.6427518129348755, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8895488977432251, + "num_tokens": 776513967.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "grad_norm": 1.4453493356704712, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.8981337547302246, + "num_tokens": 776553573.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "grad_norm": 1.6450979709625244, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8857298493385315, + "num_tokens": 776589702.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "grad_norm": 1.4815739393234253, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8928253650665283, + "num_tokens": 776631284.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "grad_norm": 1.6823301315307617, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8774054646492004, + "num_tokens": 
776663077.0, + "step": 20360 + }, + { + "epoch": 2.5901284823813766, + "grad_norm": 1.5975879430770874, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8838289976119995, + "num_tokens": 776700748.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "grad_norm": 1.7011518478393555, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8968534469604492, + "num_tokens": 776730168.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "grad_norm": 1.4592379331588745, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8830526471138, + "num_tokens": 776773546.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "grad_norm": 1.5597155094146729, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8945066928863525, + "num_tokens": 776809726.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "grad_norm": 1.7171601057052612, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8815076351165771, + "num_tokens": 776846097.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "grad_norm": 1.6280403137207031, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8725679516792297, + "num_tokens": 776885421.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "grad_norm": 1.5977143049240112, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8826583623886108, + "num_tokens": 776929810.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "grad_norm": 1.7359609603881836, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8869194984436035, + "num_tokens": 776961574.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "grad_norm": 1.4632190465927124, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8909949064254761, + "num_tokens": 777001617.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "grad_norm": 1.636812686920166, + "learning_rate": 
1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8723310232162476, + "num_tokens": 777043310.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "grad_norm": 1.524428367614746, + "learning_rate": 1e-06, + "loss": 0.2744, + "mean_token_accuracy": 0.8994455337524414, + "num_tokens": 777079079.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "grad_norm": 1.7014662027359009, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8772908449172974, + "num_tokens": 777115300.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "grad_norm": 1.4868136644363403, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8884060382843018, + "num_tokens": 777154353.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "grad_norm": 1.4938170909881592, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8927042484283447, + "num_tokens": 777189192.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "grad_norm": 1.600263237953186, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8863219618797302, + "num_tokens": 777227687.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "grad_norm": 1.6324542760849, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8688294291496277, + "num_tokens": 777266822.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "grad_norm": 1.6798391342163086, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8705356121063232, + "num_tokens": 777302678.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "grad_norm": 1.570621371269226, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8866097927093506, + "num_tokens": 777338618.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "grad_norm": 1.5294233560562134, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8902754783630371, + "num_tokens": 777374303.0, + "step": 20379 + }, + { + 
"epoch": 2.592545477674596, + "grad_norm": 1.5533740520477295, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8807451725006104, + "num_tokens": 777411091.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "grad_norm": 1.563968300819397, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8657746911048889, + "num_tokens": 777456542.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "grad_norm": 1.631237268447876, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8950726985931396, + "num_tokens": 777493292.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "grad_norm": 1.542318344116211, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8792234659194946, + "num_tokens": 777536366.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "grad_norm": 1.6215928792953491, + "learning_rate": 1e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9001221656799316, + "num_tokens": 777567017.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "grad_norm": 1.640450119972229, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8886205554008484, + "num_tokens": 777602118.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "grad_norm": 1.6518115997314453, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8906190395355225, + "num_tokens": 777634156.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "grad_norm": 1.4926037788391113, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8960407972335815, + "num_tokens": 777672468.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "grad_norm": 1.6152437925338745, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8815017938613892, + "num_tokens": 777712790.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "grad_norm": 1.6092491149902344, + "learning_rate": 1e-06, + "loss": 0.3179, + 
"mean_token_accuracy": 0.8810991644859314, + "num_tokens": 777751203.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "grad_norm": 1.563378930091858, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.886044979095459, + "num_tokens": 777789948.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "grad_norm": 1.5861412286758423, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8799580335617065, + "num_tokens": 777825085.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "grad_norm": 1.564201831817627, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8917468786239624, + "num_tokens": 777861879.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "grad_norm": 1.4289839267730713, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8912398815155029, + "num_tokens": 777903966.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "grad_norm": 1.5503231287002563, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8701930046081543, + "num_tokens": 777945849.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "grad_norm": 1.5615339279174805, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8931065797805786, + "num_tokens": 777982360.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "grad_norm": 1.5216612815856934, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8799430131912231, + "num_tokens": 778025920.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "grad_norm": 1.55324387550354, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8681405186653137, + "num_tokens": 778067633.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "grad_norm": 1.5626230239868164, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8888391852378845, + "num_tokens": 778101268.0, + "step": 20398 + }, + { + "epoch": 2.594962472967816, 
+ "grad_norm": 1.5746691226959229, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8870443105697632, + "num_tokens": 778142587.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "grad_norm": 1.5653960704803467, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8806944489479065, + "num_tokens": 778179399.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "grad_norm": 1.5648343563079834, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8839249610900879, + "num_tokens": 778215178.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "grad_norm": 1.5995293855667114, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8875221610069275, + "num_tokens": 778249688.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "grad_norm": 1.4405581951141357, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8929596543312073, + "num_tokens": 778293673.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "grad_norm": 1.6665220260620117, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8643649816513062, + "num_tokens": 778333297.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "grad_norm": 1.5140049457550049, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9003422260284424, + "num_tokens": 778368329.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "grad_norm": 1.4307221174240112, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8850929737091064, + "num_tokens": 778410328.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "grad_norm": 1.5249186754226685, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8772602081298828, + "num_tokens": 778451578.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "grad_norm": 1.4364173412322998, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8955239057540894, + 
"num_tokens": 778493845.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "grad_norm": 1.5794603824615479, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8913265466690063, + "num_tokens": 778528692.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "grad_norm": 1.4317888021469116, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.898722231388092, + "num_tokens": 778570841.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "grad_norm": 1.535009741783142, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8834609389305115, + "num_tokens": 778612017.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "grad_norm": 1.7267414331436157, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8874409198760986, + "num_tokens": 778644811.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "grad_norm": 1.6020407676696777, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8870155811309814, + "num_tokens": 778680179.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "grad_norm": 1.6491667032241821, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8780425190925598, + "num_tokens": 778716711.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "grad_norm": 1.3990334272384644, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8966550827026367, + "num_tokens": 778758524.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "grad_norm": 1.6015825271606445, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8947725296020508, + "num_tokens": 778792896.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "grad_norm": 1.5244725942611694, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8879246711730957, + "num_tokens": 778833393.0, + "step": 20417 + }, + { + "epoch": 2.5973794682610354, + "grad_norm": 1.6785155534744263, + 
"learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8787841796875, + "num_tokens": 778868297.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "grad_norm": 1.5386172533035278, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8939511775970459, + "num_tokens": 778903568.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "grad_norm": 1.7716768980026245, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8765103816986084, + "num_tokens": 778935514.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "grad_norm": 1.6102160215377808, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8799495697021484, + "num_tokens": 778975021.0, + "step": 20421 + }, + { + "epoch": 2.5978883093753975, + "grad_norm": 1.5685644149780273, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8704639673233032, + "num_tokens": 779013302.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "grad_norm": 1.6687198877334595, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8808845281600952, + "num_tokens": 779048963.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "grad_norm": 1.7311151027679443, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8825621604919434, + "num_tokens": 779079806.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "grad_norm": 1.516609787940979, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8867210149765015, + "num_tokens": 779116664.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "grad_norm": 1.7457855939865112, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8676503300666809, + "num_tokens": 779152276.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "grad_norm": 1.6147994995117188, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8870295882225037, + "num_tokens": 779189271.0, + "step": 
20427 + }, + { + "epoch": 2.5986515710469407, + "grad_norm": 1.5832072496414185, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8796801567077637, + "num_tokens": 779232231.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "grad_norm": 1.382238507270813, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8922143578529358, + "num_tokens": 779279376.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "grad_norm": 1.415905237197876, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8941760063171387, + "num_tokens": 779317902.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "grad_norm": 1.4438520669937134, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8745471835136414, + "num_tokens": 779362393.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "grad_norm": 1.545836091041565, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8914604187011719, + "num_tokens": 779401275.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "grad_norm": 1.6905261278152466, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8889085650444031, + "num_tokens": 779437258.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "grad_norm": 1.468474268913269, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8970972299575806, + "num_tokens": 779483451.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "grad_norm": 1.5709630250930786, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8982645869255066, + "num_tokens": 779518594.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "grad_norm": 1.6603361368179321, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8793078660964966, + "num_tokens": 779554995.0, + "step": 20436 + }, + { + "epoch": 2.599796463554255, + "grad_norm": 1.7407891750335693, + "learning_rate": 1e-06, + "loss": 0.3229, + 
"mean_token_accuracy": 0.8815229535102844, + "num_tokens": 779586926.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "grad_norm": 1.6471883058547974, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8840711116790771, + "num_tokens": 779621993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "grad_norm": 1.6493346691131592, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8938645720481873, + "num_tokens": 779655328.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "grad_norm": 1.6654082536697388, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8782229423522949, + "num_tokens": 779692439.0, + "step": 20440 + }, + { + "epoch": 2.600305304668617, + "grad_norm": 1.5278929471969604, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8910102844238281, + "num_tokens": 779730571.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "grad_norm": 1.647516131401062, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8868831992149353, + "num_tokens": 779763371.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "grad_norm": 1.524422526359558, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8798707723617554, + "num_tokens": 779803095.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "grad_norm": 1.5099469423294067, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8889448642730713, + "num_tokens": 779842597.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "grad_norm": 1.5232936143875122, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8833590745925903, + "num_tokens": 779884559.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "grad_norm": 1.6169214248657227, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8922229409217834, + "num_tokens": 779920020.0, + "step": 20446 + }, + { + "epoch": 
2.6010685663401603, + "grad_norm": 1.6248866319656372, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8788427114486694, + "num_tokens": 779956146.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "grad_norm": 1.6487585306167603, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8679090738296509, + "num_tokens": 779994343.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "grad_norm": 1.64091956615448, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8893643021583557, + "num_tokens": 780027552.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "grad_norm": 1.5896384716033936, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8879917860031128, + "num_tokens": 780063011.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "grad_norm": 1.4096698760986328, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8949005603790283, + "num_tokens": 780106573.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "grad_norm": 1.5539742708206177, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8837954998016357, + "num_tokens": 780148119.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "grad_norm": 1.588822603225708, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8873353004455566, + "num_tokens": 780188921.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "grad_norm": 1.542759895324707, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8793010711669922, + "num_tokens": 780229772.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "grad_norm": 1.5203593969345093, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8926181793212891, + "num_tokens": 780267544.0, + "step": 20455 + }, + { + "epoch": 2.602213458847475, + "grad_norm": 1.4778516292572021, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 
0.8906736373901367, + "num_tokens": 780307748.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "grad_norm": 1.5928434133529663, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8683902025222778, + "num_tokens": 780344621.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "grad_norm": 1.4527440071105957, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8943600654602051, + "num_tokens": 780386542.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "grad_norm": 1.4062855243682861, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.9010003209114075, + "num_tokens": 780429388.0, + "step": 20459 + }, + { + "epoch": 2.6027222999618367, + "grad_norm": 1.443616509437561, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8968194723129272, + "num_tokens": 780468854.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "grad_norm": 1.8029001951217651, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8744734525680542, + "num_tokens": 780502033.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "grad_norm": 1.563905119895935, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8887099027633667, + "num_tokens": 780540711.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "grad_norm": 1.4210727214813232, + "learning_rate": 1e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9064764976501465, + "num_tokens": 780578459.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "grad_norm": 1.6311068534851074, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8853445053100586, + "num_tokens": 780612446.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "grad_norm": 1.5867352485656738, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.873521089553833, + "num_tokens": 780649050.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "grad_norm": 
1.5928386449813843, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8921709656715393, + "num_tokens": 780685936.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "grad_norm": 1.5536704063415527, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8850542306900024, + "num_tokens": 780728827.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "grad_norm": 1.5871354341506958, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8969459533691406, + "num_tokens": 780764181.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "grad_norm": 1.4578897953033447, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8810588121414185, + "num_tokens": 780809408.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "grad_norm": 1.5671727657318115, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8908218741416931, + "num_tokens": 780849158.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "grad_norm": 1.572843074798584, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8879063129425049, + "num_tokens": 780889636.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "grad_norm": 1.698555588722229, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8854783177375793, + "num_tokens": 780926112.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "grad_norm": 1.7905516624450684, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8796869516372681, + "num_tokens": 780961622.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "grad_norm": 1.5888363122940063, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8678843975067139, + "num_tokens": 781003574.0, + "step": 20474 + }, + { + "epoch": 2.6046304541406946, + "grad_norm": 1.7181867361068726, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.886978030204773, + "num_tokens": 
781036027.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "grad_norm": 1.5515742301940918, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8794182538986206, + "num_tokens": 781076360.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "grad_norm": 1.5485466718673706, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8865672945976257, + "num_tokens": 781116953.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "grad_norm": 1.4642280340194702, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8989378809928894, + "num_tokens": 781159239.0, + "step": 20478 + }, + { + "epoch": 2.6051392952550567, + "grad_norm": 1.5115878582000732, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8878659009933472, + "num_tokens": 781200049.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "grad_norm": 1.4731950759887695, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8912287950515747, + "num_tokens": 781242687.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "grad_norm": 1.5521478652954102, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8843789100646973, + "num_tokens": 781277761.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "grad_norm": 1.766572117805481, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.876939594745636, + "num_tokens": 781315606.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "grad_norm": 1.6545634269714355, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8898895978927612, + "num_tokens": 781348852.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "grad_norm": 1.5397279262542725, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.889958918094635, + "num_tokens": 781385353.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "grad_norm": 1.6367521286010742, + "learning_rate": 
1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8898804783821106, + "num_tokens": 781420047.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "grad_norm": 1.539302110671997, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8995411992073059, + "num_tokens": 781457978.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "grad_norm": 1.5334421396255493, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8804512023925781, + "num_tokens": 781498330.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "grad_norm": 1.5725494623184204, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8941280841827393, + "num_tokens": 781533666.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "grad_norm": 1.4895986318588257, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8969470858573914, + "num_tokens": 781572387.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "grad_norm": 1.4669798612594604, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8814700841903687, + "num_tokens": 781612453.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "grad_norm": 1.5786235332489014, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8651817440986633, + "num_tokens": 781654328.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "grad_norm": 1.4955143928527832, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8848083019256592, + "num_tokens": 781698265.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "grad_norm": 1.621151328086853, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8608423471450806, + "num_tokens": 781739918.0, + "step": 20493 + }, + { + "epoch": 2.607047449433914, + "grad_norm": 1.6416882276535034, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8845025300979614, + "num_tokens": 781775632.0, + "step": 20494 + }, + { + 
"epoch": 2.6071746597125047, + "grad_norm": 1.5535327196121216, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8949316740036011, + "num_tokens": 781814026.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "grad_norm": 1.5097893476486206, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8873496055603027, + "num_tokens": 781853207.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "grad_norm": 1.5983623266220093, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8844333291053772, + "num_tokens": 781888262.0, + "step": 20497 + }, + { + "epoch": 2.6075562905482763, + "grad_norm": 1.4352675676345825, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8947271108627319, + "num_tokens": 781933220.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "grad_norm": 1.7252269983291626, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8854819536209106, + "num_tokens": 781966091.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "grad_norm": 1.5085710287094116, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8922401666641235, + "num_tokens": 782005532.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "grad_norm": 1.6161843538284302, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8810057044029236, + "num_tokens": 782041158.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "grad_norm": 1.6004291772842407, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8887592554092407, + "num_tokens": 782076150.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "grad_norm": 1.5764150619506836, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8868112564086914, + "num_tokens": 782113327.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "grad_norm": 1.539109230041504, + "learning_rate": 1e-06, + "loss": 0.34, + 
"mean_token_accuracy": 0.8793708086013794, + "num_tokens": 782151679.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "grad_norm": 1.6190167665481567, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8750510215759277, + "num_tokens": 782191326.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "grad_norm": 1.653411865234375, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8995641469955444, + "num_tokens": 782224130.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "grad_norm": 1.615192174911499, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8882788419723511, + "num_tokens": 782260368.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "grad_norm": 1.459717035293579, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.898135781288147, + "num_tokens": 782299088.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "grad_norm": 1.4599714279174805, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8934289813041687, + "num_tokens": 782339153.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "grad_norm": 1.6460579633712769, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8849443197250366, + "num_tokens": 782373823.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "grad_norm": 1.5776745080947876, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8819399476051331, + "num_tokens": 782412445.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "grad_norm": 1.529211163520813, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9001466631889343, + "num_tokens": 782449884.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "grad_norm": 1.6667078733444214, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8712780475616455, + "num_tokens": 782486479.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + 
"grad_norm": 1.474361538887024, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8904438614845276, + "num_tokens": 782529254.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "grad_norm": 1.5697506666183472, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8918313980102539, + "num_tokens": 782563028.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "grad_norm": 1.427579402923584, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.884085476398468, + "num_tokens": 782609819.0, + "step": 20516 + }, + { + "epoch": 2.609973285841496, + "grad_norm": 1.5676348209381104, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8904036283493042, + "num_tokens": 782648141.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "grad_norm": 1.530328631401062, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8909670114517212, + "num_tokens": 782684697.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "grad_norm": 1.5159801244735718, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8859676718711853, + "num_tokens": 782723060.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "grad_norm": 1.6146172285079956, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.870980441570282, + "num_tokens": 782762236.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "grad_norm": 1.4866783618927002, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8876572847366333, + "num_tokens": 782802603.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "grad_norm": 1.441418170928955, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8844184875488281, + "num_tokens": 782844697.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "grad_norm": 1.6227835416793823, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8867075443267822, + 
"num_tokens": 782885776.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "grad_norm": 1.7206445932388306, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8870843648910522, + "num_tokens": 782916838.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "grad_norm": 1.7467870712280273, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8736585974693298, + "num_tokens": 782951904.0, + "step": 20525 + }, + { + "epoch": 2.6111181783488107, + "grad_norm": 1.651118516921997, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.883577287197113, + "num_tokens": 782987064.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "grad_norm": 1.507014513015747, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8753246068954468, + "num_tokens": 783029746.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "grad_norm": 1.5285650491714478, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8843886852264404, + "num_tokens": 783071123.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "grad_norm": 1.489109992980957, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8895425796508789, + "num_tokens": 783112235.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "grad_norm": 1.5863369703292847, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8794715404510498, + "num_tokens": 783152285.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "grad_norm": 1.4736360311508179, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8916219472885132, + "num_tokens": 783190275.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "grad_norm": 1.6334729194641113, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8703258633613586, + "num_tokens": 783226794.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "grad_norm": 1.6109163761138916, + 
"learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8805703520774841, + "num_tokens": 783261573.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "grad_norm": 1.6356096267700195, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.885771632194519, + "num_tokens": 783296402.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "grad_norm": 1.4659016132354736, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8876669406890869, + "num_tokens": 783339810.0, + "step": 20535 + }, + { + "epoch": 2.612390281134716, + "grad_norm": 1.6130317449569702, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8847619295120239, + "num_tokens": 783377772.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "grad_norm": 1.5033270120620728, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8851501941680908, + "num_tokens": 783422226.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "grad_norm": 1.4169448614120483, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.8962084054946899, + "num_tokens": 783464659.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "grad_norm": 1.7801247835159302, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8818114399909973, + "num_tokens": 783497870.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "grad_norm": 1.576826810836792, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8809522390365601, + "num_tokens": 783536635.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "grad_norm": 1.5496869087219238, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8641768097877502, + "num_tokens": 783577122.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "grad_norm": 1.541632056236267, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8832924365997314, + "num_tokens": 783615942.0, + "step": 
20542 + }, + { + "epoch": 2.613280753084849, + "grad_norm": 1.6208022832870483, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8803694248199463, + "num_tokens": 783651391.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "grad_norm": 1.4674887657165527, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9002023935317993, + "num_tokens": 783689435.0, + "step": 20544 + }, + { + "epoch": 2.6135351736420303, + "grad_norm": 1.4669711589813232, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.890326976776123, + "num_tokens": 783731111.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "grad_norm": 1.5673750638961792, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8809759616851807, + "num_tokens": 783770289.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "grad_norm": 1.6072213649749756, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8792428970336914, + "num_tokens": 783809005.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "grad_norm": 1.496455192565918, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8969680070877075, + "num_tokens": 783848332.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "grad_norm": 1.650294303894043, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8926470279693604, + "num_tokens": 783884396.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "grad_norm": 1.6450358629226685, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8935362100601196, + "num_tokens": 783922061.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "grad_norm": 1.5711992979049683, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.88792884349823, + "num_tokens": 783959911.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "grad_norm": 1.5000554323196411, + "learning_rate": 1e-06, + "loss": 0.3356, + 
"mean_token_accuracy": 0.8798118829727173, + "num_tokens": 783999789.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "grad_norm": 1.785910725593567, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8777123689651489, + "num_tokens": 784034022.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "grad_norm": 1.5337177515029907, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8930358290672302, + "num_tokens": 784069936.0, + "step": 20554 + }, + { + "epoch": 2.6148072764279355, + "grad_norm": 1.5802382230758667, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8844408392906189, + "num_tokens": 784107494.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "grad_norm": 1.5060665607452393, + "learning_rate": 1e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.902753472328186, + "num_tokens": 784140634.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "grad_norm": 1.5103929042816162, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8939926624298096, + "num_tokens": 784182248.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "grad_norm": 1.3893650770187378, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8836187124252319, + "num_tokens": 784226822.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "grad_norm": 1.4033750295639038, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8894460797309875, + "num_tokens": 784269731.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "grad_norm": 1.434235692024231, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8829017877578735, + "num_tokens": 784315732.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "grad_norm": 1.4766581058502197, + "learning_rate": 1e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9014908075332642, + "num_tokens": 784349127.0, + "step": 20561 + }, + { + "epoch": 
2.615697748378069, + "grad_norm": 1.6360559463500977, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8852395415306091, + "num_tokens": 784385473.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "grad_norm": 1.5660237073898315, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8861521482467651, + "num_tokens": 784424502.0, + "step": 20563 + }, + { + "epoch": 2.61595216893525, + "grad_norm": 1.6197775602340698, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8724093437194824, + "num_tokens": 784464884.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "grad_norm": 1.4804842472076416, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8902491331100464, + "num_tokens": 784507843.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "grad_norm": 1.4187538623809814, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8938060998916626, + "num_tokens": 784551745.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "grad_norm": 1.5733047723770142, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8881900310516357, + "num_tokens": 784592264.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "grad_norm": 1.5679970979690552, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8802515864372253, + "num_tokens": 784633848.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "grad_norm": 1.6021620035171509, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8957881331443787, + "num_tokens": 784666516.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "grad_norm": 1.4730579853057861, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.8963371515274048, + "num_tokens": 784703606.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "grad_norm": 1.488421082496643, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 
0.8923209309577942, + "num_tokens": 784741662.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "grad_norm": 1.4641278982162476, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8864882588386536, + "num_tokens": 784782722.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "grad_norm": 1.5462900400161743, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8977474570274353, + "num_tokens": 784819590.0, + "step": 20573 + }, + { + "epoch": 2.617224271721155, + "grad_norm": 1.5333480834960938, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8776096105575562, + "num_tokens": 784857575.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "grad_norm": 1.6358205080032349, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8919475078582764, + "num_tokens": 784892558.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "grad_norm": 1.574330449104309, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8816197514533997, + "num_tokens": 784931121.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "grad_norm": 1.5081748962402344, + "learning_rate": 1e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9016080498695374, + "num_tokens": 784969048.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "grad_norm": 1.5479252338409424, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8756731748580933, + "num_tokens": 785010071.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "grad_norm": 1.7002441883087158, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8812385201454163, + "num_tokens": 785046919.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "grad_norm": 1.6697981357574463, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8913710117340088, + "num_tokens": 785079773.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "grad_norm": 
1.5457707643508911, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8889009356498718, + "num_tokens": 785117078.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "grad_norm": 1.5024961233139038, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8771528601646423, + "num_tokens": 785157597.0, + "step": 20582 + }, + { + "epoch": 2.6183691642284694, + "grad_norm": 1.6138032674789429, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8849966526031494, + "num_tokens": 785189018.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "grad_norm": 1.551806926727295, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.898438572883606, + "num_tokens": 785224965.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "grad_norm": 1.4639952182769775, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8944509029388428, + "num_tokens": 785266101.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "grad_norm": 1.5024656057357788, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8958768844604492, + "num_tokens": 785303141.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "grad_norm": 1.6034959554672241, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8978907465934753, + "num_tokens": 785340361.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "grad_norm": 1.5649396181106567, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8949670195579529, + "num_tokens": 785375872.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "grad_norm": 1.597412347793579, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8943816423416138, + "num_tokens": 785412335.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "grad_norm": 1.4950993061065674, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.89227694272995, + "num_tokens": 
785450777.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "grad_norm": 1.5688263177871704, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8699322938919067, + "num_tokens": 785494065.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "grad_norm": 1.5276460647583008, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8828484416007996, + "num_tokens": 785538235.0, + "step": 20592 + }, + { + "epoch": 2.6196412670143747, + "grad_norm": 1.5930315256118774, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8732283711433411, + "num_tokens": 785576597.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "grad_norm": 1.397467017173767, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8925148248672485, + "num_tokens": 785619349.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "grad_norm": 1.7168538570404053, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8734115362167358, + "num_tokens": 785654714.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "grad_norm": 1.4003620147705078, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8903406262397766, + "num_tokens": 785701504.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "grad_norm": 1.6578423976898193, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8853678703308105, + "num_tokens": 785735380.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "grad_norm": 1.6387497186660767, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8874708414077759, + "num_tokens": 785770947.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "grad_norm": 1.3864408731460571, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8952988386154175, + "num_tokens": 785812100.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "grad_norm": 1.5866979360580444, + "learning_rate": 
1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8908348083496094, + "num_tokens": 785845356.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "grad_norm": 1.459183931350708, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8842777013778687, + "num_tokens": 785886626.0, + "step": 20601 + }, + { + "epoch": 2.6207861595216895, + "grad_norm": 1.4467988014221191, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.880939245223999, + "num_tokens": 785929918.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "grad_norm": 1.5002703666687012, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.886349081993103, + "num_tokens": 785969407.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "grad_norm": 1.474737524986267, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.9002195596694946, + "num_tokens": 786008059.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "grad_norm": 1.572134256362915, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8880170583724976, + "num_tokens": 786048148.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "grad_norm": 1.5374640226364136, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8875727653503418, + "num_tokens": 786086080.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "grad_norm": 1.5830990076065063, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8834326267242432, + "num_tokens": 786121622.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "grad_norm": 1.5148998498916626, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8936654329299927, + "num_tokens": 786158281.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "grad_norm": 1.5779918432235718, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8763774633407593, + "num_tokens": 786200310.0, + "step": 20609 + }, + { + 
"epoch": 2.6218038417504133, + "grad_norm": 1.636384129524231, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8827825784683228, + "num_tokens": 786234340.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "grad_norm": 1.6364672183990479, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8772744536399841, + "num_tokens": 786271972.0, + "step": 20611 + }, + { + "epoch": 2.6220582623075943, + "grad_norm": 1.4996392726898193, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8947156071662903, + "num_tokens": 786312763.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "grad_norm": 1.5866308212280273, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8850809335708618, + "num_tokens": 786351523.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "grad_norm": 1.5550464391708374, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8826441764831543, + "num_tokens": 786387234.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "grad_norm": 1.7199156284332275, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8814941048622131, + "num_tokens": 786421649.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "grad_norm": 1.4373308420181274, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8898392915725708, + "num_tokens": 786460023.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "grad_norm": 1.716782569885254, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8925374746322632, + "num_tokens": 786490414.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "grad_norm": 1.4358420372009277, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8963004350662231, + "num_tokens": 786530734.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "grad_norm": 1.6458982229232788, + "learning_rate": 1e-06, + "loss": 0.3595, + 
"mean_token_accuracy": 0.8713160753250122, + "num_tokens": 786569626.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "grad_norm": 1.4904085397720337, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8917239904403687, + "num_tokens": 786608871.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, + "grad_norm": 1.6513092517852783, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8859456777572632, + "num_tokens": 786642665.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "grad_norm": 1.6983693838119507, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.882353663444519, + "num_tokens": 786674786.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "grad_norm": 1.4451935291290283, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8962196111679077, + "num_tokens": 786717784.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "grad_norm": 1.4826009273529053, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.88355553150177, + "num_tokens": 786758655.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "grad_norm": 1.5323731899261475, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8920434713363647, + "num_tokens": 786793114.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "grad_norm": 1.4530819654464722, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8884779214859009, + "num_tokens": 786838191.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "grad_norm": 1.5909008979797363, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8925260305404663, + "num_tokens": 786879629.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "grad_norm": 1.449515700340271, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8920609951019287, + "num_tokens": 786922450.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, 
+ "grad_norm": 1.478190541267395, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8892151117324829, + "num_tokens": 786960943.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "grad_norm": 1.5704455375671387, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8734493851661682, + "num_tokens": 787000289.0, + "step": 20630 + }, + { + "epoch": 2.624475257600814, + "grad_norm": 1.6843327283859253, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8775773048400879, + "num_tokens": 787036535.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "grad_norm": 1.4780601263046265, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8911473751068115, + "num_tokens": 787073822.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "grad_norm": 1.5174269676208496, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8887070417404175, + "num_tokens": 787114739.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "grad_norm": 1.6021524667739868, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8803330659866333, + "num_tokens": 787150828.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "grad_norm": 1.6165111064910889, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.88657546043396, + "num_tokens": 787186024.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "grad_norm": 1.479383945465088, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8862676620483398, + "num_tokens": 787224433.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "grad_norm": 1.4469785690307617, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8901581764221191, + "num_tokens": 787264642.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "grad_norm": 1.4552233219146729, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8841582536697388, + 
"num_tokens": 787308051.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "grad_norm": 1.5992481708526611, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8749664425849915, + "num_tokens": 787347998.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "grad_norm": 1.4606305360794067, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8850463628768921, + "num_tokens": 787389179.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "grad_norm": 1.6215097904205322, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.883908212184906, + "num_tokens": 787426472.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "grad_norm": 1.6483612060546875, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8912447690963745, + "num_tokens": 787460081.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "grad_norm": 1.480237364768982, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8868441581726074, + "num_tokens": 787501088.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "grad_norm": 1.4537242650985718, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8854353427886963, + "num_tokens": 787540073.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "grad_norm": 1.5125373601913452, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.892339825630188, + "num_tokens": 787581658.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "grad_norm": 1.522953987121582, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8752470016479492, + "num_tokens": 787624371.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "grad_norm": 1.542054295539856, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8890655040740967, + "num_tokens": 787660782.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "grad_norm": 1.5277987718582153, + 
"learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8929893970489502, + "num_tokens": 787701389.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "grad_norm": 1.4226850271224976, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8843453526496887, + "num_tokens": 787745522.0, + "step": 20649 + }, + { + "epoch": 2.626892252894034, + "grad_norm": 1.510992407798767, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8879084587097168, + "num_tokens": 787786743.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "grad_norm": 1.6671700477600098, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8686192035675049, + "num_tokens": 787822748.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "grad_norm": 1.4486075639724731, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8913480639457703, + "num_tokens": 787862482.0, + "step": 20652 + }, + { + "epoch": 2.6272738837298055, + "grad_norm": 1.5749213695526123, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8903101682662964, + "num_tokens": 787896819.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "grad_norm": 1.6749703884124756, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8777679204940796, + "num_tokens": 787932032.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "grad_norm": 1.519515872001648, + "learning_rate": 1e-06, + "loss": 0.2722, + "mean_token_accuracy": 0.8989153504371643, + "num_tokens": 787968085.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "grad_norm": 1.4797937870025635, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8971360325813293, + "num_tokens": 788005456.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "grad_norm": 1.567521333694458, + "learning_rate": 1e-06, + "loss": 0.272, + "mean_token_accuracy": 0.8997297883033752, + "num_tokens": 788042641.0, + 
"step": 20657 + }, + { + "epoch": 2.6279099351227577, + "grad_norm": 1.57784903049469, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8895685076713562, + "num_tokens": 788079242.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "grad_norm": 1.7363476753234863, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8800582885742188, + "num_tokens": 788111585.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "grad_norm": 1.6090961694717407, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.88585364818573, + "num_tokens": 788149460.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "grad_norm": 1.4560863971710205, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9002617001533508, + "num_tokens": 788189707.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "grad_norm": 1.7647093534469604, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8812521696090698, + "num_tokens": 788227811.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "grad_norm": 1.6387859582901, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8826676607131958, + "num_tokens": 788263407.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "grad_norm": 1.730829119682312, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8864220380783081, + "num_tokens": 788295903.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "grad_norm": 1.6082725524902344, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8749203681945801, + "num_tokens": 788333923.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "grad_norm": 1.6261388063430786, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8787339925765991, + "num_tokens": 788371156.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "grad_norm": 1.528437614440918, + "learning_rate": 1e-06, + "loss": 0.3217, 
+ "mean_token_accuracy": 0.8836643695831299, + "num_tokens": 788411698.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "grad_norm": 1.6658837795257568, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8873646259307861, + "num_tokens": 788451126.0, + "step": 20668 + }, + { + "epoch": 2.6293092481872535, + "grad_norm": 1.6902216672897339, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8846702575683594, + "num_tokens": 788483781.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "grad_norm": 1.6350854635238647, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.879209578037262, + "num_tokens": 788517256.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "grad_norm": 1.6797221899032593, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8747881650924683, + "num_tokens": 788551142.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "grad_norm": 1.5704065561294556, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8775975108146667, + "num_tokens": 788588200.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "grad_norm": 1.6779451370239258, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8769299387931824, + "num_tokens": 788623158.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "grad_norm": 1.5917762517929077, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8772915601730347, + "num_tokens": 788661715.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "grad_norm": 1.5148594379425049, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8790413737297058, + "num_tokens": 788703285.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "grad_norm": 1.480438470840454, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8895567059516907, + "num_tokens": 788743803.0, + "step": 20676 + }, + { + "epoch": 
2.6303269304159778, + "grad_norm": 1.4895594120025635, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.890281081199646, + "num_tokens": 788782351.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "grad_norm": 1.5971800088882446, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8779588937759399, + "num_tokens": 788823389.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "grad_norm": 1.656376838684082, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8804490566253662, + "num_tokens": 788860263.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "grad_norm": 1.5698312520980835, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8912639617919922, + "num_tokens": 788898898.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "grad_norm": 1.6465277671813965, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8859376907348633, + "num_tokens": 788935992.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "grad_norm": 1.4538995027542114, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8886576890945435, + "num_tokens": 788979259.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "grad_norm": 1.662592887878418, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8842429518699646, + "num_tokens": 789017770.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "grad_norm": 1.5165859460830688, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8824859857559204, + "num_tokens": 789057120.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "grad_norm": 1.5400656461715698, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8975570797920227, + "num_tokens": 789093060.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "grad_norm": 1.6992906332015991, + "learning_rate": 1e-06, + "loss": 0.3434, + 
"mean_token_accuracy": 0.879551887512207, + "num_tokens": 789126585.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "grad_norm": 1.5641947984695435, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8929461240768433, + "num_tokens": 789165868.0, + "step": 20687 + }, + { + "epoch": 2.631726243480473, + "grad_norm": 1.5095304250717163, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8875452280044556, + "num_tokens": 789206082.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "grad_norm": 1.4674780368804932, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8962780237197876, + "num_tokens": 789242656.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "grad_norm": 1.6074013710021973, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8890262246131897, + "num_tokens": 789280138.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "grad_norm": 1.47921621799469, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8962850570678711, + "num_tokens": 789322513.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "grad_norm": 1.624577522277832, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8875939846038818, + "num_tokens": 789359992.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "grad_norm": 1.542451024055481, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8859231472015381, + "num_tokens": 789400670.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "grad_norm": 1.476271390914917, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8827726244926453, + "num_tokens": 789449027.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "grad_norm": 1.401091456413269, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.8993527293205261, + "num_tokens": 789490711.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, 
+ "grad_norm": 1.5525931119918823, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8914563059806824, + "num_tokens": 789528083.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "grad_norm": 1.4750614166259766, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.894041895866394, + "num_tokens": 789570029.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "grad_norm": 1.6502549648284912, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8909059166908264, + "num_tokens": 789605576.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "grad_norm": 1.7200652360916138, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.883350133895874, + "num_tokens": 789640143.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "grad_norm": 1.5198973417282104, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8804918527603149, + "num_tokens": 789683672.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "grad_norm": 1.7943850755691528, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8869994878768921, + "num_tokens": 789716761.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "grad_norm": 1.6189749240875244, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8836854696273804, + "num_tokens": 789751959.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "grad_norm": 1.5849549770355225, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8794561624526978, + "num_tokens": 789792202.0, + "step": 20703 + }, + { + "epoch": 2.633761607937921, + "grad_norm": 1.5405060052871704, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8805195093154907, + "num_tokens": 789832747.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "grad_norm": 1.7328166961669922, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8749440312385559, + 
"num_tokens": 789866363.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "grad_norm": 1.7040433883666992, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8642928004264832, + "num_tokens": 789902502.0, + "step": 20706 + }, + { + "epoch": 2.634143238773693, + "grad_norm": 1.555477499961853, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8814845085144043, + "num_tokens": 789939958.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "grad_norm": 1.6057225465774536, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8757465481758118, + "num_tokens": 789977198.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "grad_norm": 1.3627443313598633, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8951046466827393, + "num_tokens": 790019892.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "grad_norm": 1.4921507835388184, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8984007835388184, + "num_tokens": 790056344.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "grad_norm": 1.4174851179122925, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.8973692059516907, + "num_tokens": 790096504.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "grad_norm": 1.662161946296692, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8791217803955078, + "num_tokens": 790131669.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "grad_norm": 1.5063292980194092, + "learning_rate": 1e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9020766019821167, + "num_tokens": 790167250.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "grad_norm": 1.4714381694793701, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8771394491195679, + "num_tokens": 790213100.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "grad_norm": 1.4416508674621582, + 
"learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8909786343574524, + "num_tokens": 790253572.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "grad_norm": 1.5846284627914429, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8895530104637146, + "num_tokens": 790287640.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "grad_norm": 1.594893455505371, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.875280499458313, + "num_tokens": 790324460.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "grad_norm": 1.576299786567688, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8813729286193848, + "num_tokens": 790366020.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "grad_norm": 1.51412034034729, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8804082870483398, + "num_tokens": 790405502.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "grad_norm": 1.5903874635696411, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8785600662231445, + "num_tokens": 790441206.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "grad_norm": 1.5909446477890015, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8840371370315552, + "num_tokens": 790477447.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "grad_norm": 1.673767328262329, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8843762278556824, + "num_tokens": 790510448.0, + "step": 20722 + }, + { + "epoch": 2.636178603231141, + "grad_norm": 1.505668044090271, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8932898640632629, + "num_tokens": 790548548.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "grad_norm": 1.609106183052063, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8762272596359253, + "num_tokens": 790584453.0, + "step": 20724 
+ }, + { + "epoch": 2.6364330237883222, + "grad_norm": 1.589723825454712, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8779266476631165, + "num_tokens": 790622251.0, + "step": 20725 + }, + { + "epoch": 2.6365602340669128, + "grad_norm": 1.5774073600769043, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8919968605041504, + "num_tokens": 790657930.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "grad_norm": 1.6078439950942993, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8731997609138489, + "num_tokens": 790698289.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "grad_norm": 1.467917799949646, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8852981328964233, + "num_tokens": 790742489.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "grad_norm": 1.5094354152679443, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8856433629989624, + "num_tokens": 790781506.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "grad_norm": 1.6747584342956543, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8853853344917297, + "num_tokens": 790817102.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "grad_norm": 1.6976786851882935, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8646115064620972, + "num_tokens": 790860092.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "grad_norm": 1.6819630861282349, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8818911910057068, + "num_tokens": 790896507.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "grad_norm": 1.6410105228424072, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8851659297943115, + "num_tokens": 790936238.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "grad_norm": 1.4940646886825562, + "learning_rate": 1e-06, + "loss": 0.3322, + 
"mean_token_accuracy": 0.8826732635498047, + "num_tokens": 790977330.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "grad_norm": 1.6837724447250366, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.9000996947288513, + "num_tokens": 791007858.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "grad_norm": 1.531546711921692, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8775200247764587, + "num_tokens": 791052074.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "grad_norm": 1.5090069770812988, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8925313949584961, + "num_tokens": 791093809.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "grad_norm": 1.62299644947052, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8809405565261841, + "num_tokens": 791130610.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "grad_norm": 1.7706547975540161, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8823960423469543, + "num_tokens": 791167682.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "grad_norm": 1.6066417694091797, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8754405379295349, + "num_tokens": 791205329.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "grad_norm": 1.4384825229644775, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.889798104763031, + "num_tokens": 791248393.0, + "step": 20741 + }, + { + "epoch": 2.6385955985243608, + "grad_norm": 1.4982246160507202, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8824530839920044, + "num_tokens": 791288862.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "grad_norm": 1.3995296955108643, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8948434591293335, + "num_tokens": 791330031.0, + "step": 20743 + }, + { + "epoch": 
2.638850019081542, + "grad_norm": 1.6280080080032349, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8926123976707458, + "num_tokens": 791363478.0, + "step": 20744 + }, + { + "epoch": 2.6389772293601323, + "grad_norm": 1.5274358987808228, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8820151090621948, + "num_tokens": 791406483.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "grad_norm": 1.494957685470581, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8929051160812378, + "num_tokens": 791449857.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "grad_norm": 1.575685977935791, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8875793814659119, + "num_tokens": 791484948.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "grad_norm": 1.5626577138900757, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8829272389411926, + "num_tokens": 791525155.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "grad_norm": 1.656481146812439, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8929386138916016, + "num_tokens": 791560812.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "grad_norm": 1.5149306058883667, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8846225738525391, + "num_tokens": 791602773.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "grad_norm": 1.5266958475112915, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8913877010345459, + "num_tokens": 791641234.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "grad_norm": 1.4559208154678345, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8947670459747314, + "num_tokens": 791682041.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "grad_norm": 1.433683156967163, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 
0.8977657556533813, + "num_tokens": 791726127.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "grad_norm": 1.652618408203125, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8764557838439941, + "num_tokens": 791760569.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "grad_norm": 1.401038646697998, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8882412910461426, + "num_tokens": 791805559.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "grad_norm": 1.604244589805603, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8873152136802673, + "num_tokens": 791842566.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "grad_norm": 1.7120238542556763, + "learning_rate": 1e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9033487439155579, + "num_tokens": 791871837.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "grad_norm": 1.5686129331588745, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8751435279846191, + "num_tokens": 791907538.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "grad_norm": 1.6358404159545898, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8724792003631592, + "num_tokens": 791945907.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "grad_norm": 1.4473843574523926, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8954312801361084, + "num_tokens": 791985975.0, + "step": 20760 + }, + { + "epoch": 2.6410125938175804, + "grad_norm": 1.6562577486038208, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8853946924209595, + "num_tokens": 792022427.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "grad_norm": 1.4800896644592285, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8918565511703491, + "num_tokens": 792061665.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "grad_norm": 
1.7144274711608887, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8879883289337158, + "num_tokens": 792093217.0, + "step": 20763 + }, + { + "epoch": 2.641394224653352, + "grad_norm": 1.6193345785140991, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.887640118598938, + "num_tokens": 792127604.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "grad_norm": 1.5353933572769165, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8950742483139038, + "num_tokens": 792162276.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "grad_norm": 1.479943871498108, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8878512382507324, + "num_tokens": 792203155.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "grad_norm": 1.403257966041565, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.899631917476654, + "num_tokens": 792243543.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "grad_norm": 1.5856446027755737, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8857957124710083, + "num_tokens": 792277857.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "grad_norm": 1.4582006931304932, + "learning_rate": 1e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.9036916494369507, + "num_tokens": 792317246.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "grad_norm": 1.7402195930480957, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8729190826416016, + "num_tokens": 792352070.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "grad_norm": 1.4220373630523682, + "learning_rate": 1e-06, + "loss": 0.2724, + "mean_token_accuracy": 0.9001498222351074, + "num_tokens": 792389391.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "grad_norm": 1.463904857635498, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.8996972441673279, + "num_tokens": 
792430686.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "grad_norm": 1.6151357889175415, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8831614851951599, + "num_tokens": 792467809.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "grad_norm": 1.464064359664917, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8836996555328369, + "num_tokens": 792509690.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "grad_norm": 1.3885186910629272, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8854451775550842, + "num_tokens": 792555706.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "grad_norm": 1.5516469478607178, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8824795484542847, + "num_tokens": 792599272.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "grad_norm": 1.5384175777435303, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8854762315750122, + "num_tokens": 792637049.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "grad_norm": 1.6270065307617188, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8857395648956299, + "num_tokens": 792671373.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "grad_norm": 1.604250431060791, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8788732886314392, + "num_tokens": 792712198.0, + "step": 20779 + }, + { + "epoch": 2.6434295891108004, + "grad_norm": 1.6046898365020752, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8714854717254639, + "num_tokens": 792751400.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "grad_norm": 1.468957543373108, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8775626420974731, + "num_tokens": 792797175.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "grad_norm": 1.5687665939331055, + "learning_rate": 
1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.88878333568573, + "num_tokens": 792835972.0, + "step": 20782 + }, + { + "epoch": 2.6438112199465715, + "grad_norm": 1.5421346426010132, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8834062814712524, + "num_tokens": 792873211.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "grad_norm": 1.5411397218704224, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8747075796127319, + "num_tokens": 792914680.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "grad_norm": 1.5737040042877197, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8952526450157166, + "num_tokens": 792954431.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "grad_norm": 1.5424591302871704, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8868720531463623, + "num_tokens": 792992085.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "grad_norm": 1.541359782218933, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8715032935142517, + "num_tokens": 793032598.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "grad_norm": 1.675653100013733, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8800786137580872, + "num_tokens": 793065060.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "grad_norm": 1.4691606760025024, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8729338049888611, + "num_tokens": 793108654.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "grad_norm": 1.5055956840515137, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8920882344245911, + "num_tokens": 793146015.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "grad_norm": 1.7546902894973755, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8706042170524597, + "num_tokens": 793180684.0, + "step": 20791 + }, + { + 
"epoch": 2.6449561124538863, + "grad_norm": 1.5231189727783203, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8880552649497986, + "num_tokens": 793217204.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "grad_norm": 1.7145676612854004, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8766716122627258, + "num_tokens": 793255577.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "grad_norm": 1.4697368144989014, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8924945592880249, + "num_tokens": 793294283.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "grad_norm": 1.5668662786483765, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8835705518722534, + "num_tokens": 793331187.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "grad_norm": 1.6464290618896484, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8746731281280518, + "num_tokens": 793367675.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "grad_norm": 1.5720551013946533, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8708559274673462, + "num_tokens": 793407907.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "grad_norm": 1.6352628469467163, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8834807872772217, + "num_tokens": 793442670.0, + "step": 20798 + }, + { + "epoch": 2.64584658440402, + "grad_norm": 1.486073613166809, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8772047162055969, + "num_tokens": 793487658.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "grad_norm": 1.8363538980484009, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8772117495536804, + "num_tokens": 793517761.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "grad_norm": 1.4720336198806763, + "learning_rate": 1e-06, + "loss": 0.3224, + 
"mean_token_accuracy": 0.8833612203598022, + "num_tokens": 793559218.0, + "step": 20801 + }, + { + "epoch": 2.646228215239791, + "grad_norm": 1.4816246032714844, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8819526433944702, + "num_tokens": 793602003.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "grad_norm": 1.670157790184021, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8850018978118896, + "num_tokens": 793634320.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "grad_norm": 1.427603840827942, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8841066360473633, + "num_tokens": 793677056.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "grad_norm": 1.630579948425293, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8859081864356995, + "num_tokens": 793710454.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "grad_norm": 1.5732861757278442, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8953686356544495, + "num_tokens": 793744995.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "grad_norm": 1.544189691543579, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8850610852241516, + "num_tokens": 793782970.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "grad_norm": 1.4332317113876343, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8879227042198181, + "num_tokens": 793824665.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "grad_norm": 1.647484540939331, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8858498334884644, + "num_tokens": 793858505.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "grad_norm": 1.5327068567276, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8730263113975525, + "num_tokens": 793899166.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + 
"grad_norm": 1.5846493244171143, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.888375997543335, + "num_tokens": 793933730.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "grad_norm": 1.5263473987579346, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8829675912857056, + "num_tokens": 793974907.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "grad_norm": 1.465444564819336, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.883285403251648, + "num_tokens": 794017975.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "grad_norm": 1.5091795921325684, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8807305097579956, + "num_tokens": 794059852.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "grad_norm": 1.4705816507339478, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8840290307998657, + "num_tokens": 794100200.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "grad_norm": 1.407135248184204, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.892145037651062, + "num_tokens": 794143327.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "grad_norm": 1.5718480348587036, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8830863833427429, + "num_tokens": 794182605.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "grad_norm": 1.5299702882766724, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8913217186927795, + "num_tokens": 794220044.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "grad_norm": 1.4742530584335327, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8772135972976685, + "num_tokens": 794266427.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "grad_norm": 1.4352658987045288, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8910684585571289, + 
"num_tokens": 794308623.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "grad_norm": 1.6139744520187378, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8818151950836182, + "num_tokens": 794348908.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "grad_norm": 1.4715405702590942, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8918907642364502, + "num_tokens": 794388495.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "grad_norm": 1.6193495988845825, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8856937289237976, + "num_tokens": 794422929.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "grad_norm": 1.4176437854766846, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8829407691955566, + "num_tokens": 794464997.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "grad_norm": 1.803420901298523, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8788803815841675, + "num_tokens": 794496237.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "grad_norm": 1.4037017822265625, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8922315239906311, + "num_tokens": 794540397.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "grad_norm": 1.6704435348510742, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8876301050186157, + "num_tokens": 794575030.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "grad_norm": 1.6517720222473145, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8804205656051636, + "num_tokens": 794612085.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "grad_norm": 1.6900784969329834, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8777950406074524, + "num_tokens": 794648461.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "grad_norm": 1.5398567914962769, + 
"learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8881807327270508, + "num_tokens": 794687476.0, + "step": 20830 + }, + { + "epoch": 2.649917313318916, + "grad_norm": 1.4884248971939087, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8736217617988586, + "num_tokens": 794731446.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "grad_norm": 1.6973087787628174, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8822789788246155, + "num_tokens": 794762012.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "grad_norm": 1.477871298789978, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8939043283462524, + "num_tokens": 794799075.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "grad_norm": 1.6925228834152222, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8853698968887329, + "num_tokens": 794832777.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "grad_norm": 1.6557804346084595, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8853361010551453, + "num_tokens": 794868274.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "grad_norm": 1.5094349384307861, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8961551189422607, + "num_tokens": 794905329.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "grad_norm": 1.6740992069244385, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8761887550354004, + "num_tokens": 794941340.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "grad_norm": 1.511142611503601, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8874353766441345, + "num_tokens": 794980114.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "grad_norm": 1.6980332136154175, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8765121698379517, + "num_tokens": 795016285.0, + 
"step": 20839 + }, + { + "epoch": 2.6510622058262308, + "grad_norm": 1.7030807733535767, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8788560628890991, + "num_tokens": 795052330.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "grad_norm": 1.576197624206543, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8820550441741943, + "num_tokens": 795092724.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "grad_norm": 1.5874559879302979, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8850498199462891, + "num_tokens": 795130298.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "grad_norm": 1.49888014793396, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8874987959861755, + "num_tokens": 795169053.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "grad_norm": 1.499709963798523, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8825688362121582, + "num_tokens": 795210871.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "grad_norm": 1.5046333074569702, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8764032125473022, + "num_tokens": 795251926.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "grad_norm": 1.4502747058868408, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8930029273033142, + "num_tokens": 795293258.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "grad_norm": 1.479742169380188, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8938700556755066, + "num_tokens": 795330965.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "grad_norm": 1.5586456060409546, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8831573724746704, + "num_tokens": 795372906.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "grad_norm": 1.7002227306365967, + "learning_rate": 1e-06, + "loss": 
0.3274, + "mean_token_accuracy": 0.885749340057373, + "num_tokens": 795408582.0, + "step": 20849 + }, + { + "epoch": 2.6523343086121356, + "grad_norm": 1.7032926082611084, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8847442865371704, + "num_tokens": 795441524.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "grad_norm": 1.4730503559112549, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8704233169555664, + "num_tokens": 795487987.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "grad_norm": 1.4123387336730957, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8724737763404846, + "num_tokens": 795534329.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "grad_norm": 1.7225115299224854, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8832486867904663, + "num_tokens": 795567069.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "grad_norm": 1.3949798345565796, + "learning_rate": 1e-06, + "loss": 0.2775, + "mean_token_accuracy": 0.8998629450798035, + "num_tokens": 795610513.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "grad_norm": 1.6001596450805664, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8955515623092651, + "num_tokens": 795645197.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "grad_norm": 1.487169623374939, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8869819641113281, + "num_tokens": 795684557.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "grad_norm": 1.4531556367874146, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8882902264595032, + "num_tokens": 795725728.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "grad_norm": 1.5818686485290527, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8781441450119019, + "num_tokens": 795763169.0, + "step": 20858 + }, + { + "epoch": 
2.6534792011194503, + "grad_norm": 1.6462470293045044, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8910451531410217, + "num_tokens": 795798943.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "grad_norm": 1.6333674192428589, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8898038864135742, + "num_tokens": 795836739.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "grad_norm": 1.5180858373641968, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8770672082901001, + "num_tokens": 795878026.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "grad_norm": 1.5584089756011963, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.889494776725769, + "num_tokens": 795915431.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "grad_norm": 1.5301865339279175, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8940202593803406, + "num_tokens": 795954088.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "grad_norm": 1.4767215251922607, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.879694938659668, + "num_tokens": 795995692.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "grad_norm": 1.6268702745437622, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8809197545051575, + "num_tokens": 796032547.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "grad_norm": 1.6122143268585205, + "learning_rate": 1e-06, + "loss": 0.2666, + "mean_token_accuracy": 0.9021527767181396, + "num_tokens": 796063920.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "grad_norm": 1.5360541343688965, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.881943941116333, + "num_tokens": 796100674.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "grad_norm": 1.4220243692398071, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 
0.896229088306427, + "num_tokens": 796141766.0, + "step": 20868 + }, + { + "epoch": 2.6547513039053556, + "grad_norm": 1.4652674198150635, + "learning_rate": 1e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.9045780897140503, + "num_tokens": 796180374.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "grad_norm": 1.6120223999023438, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8884199857711792, + "num_tokens": 796218388.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "grad_norm": 1.6043317317962646, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8938018083572388, + "num_tokens": 796253199.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "grad_norm": 1.5056272745132446, + "learning_rate": 1e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.8955468535423279, + "num_tokens": 796292078.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "grad_norm": 1.6089383363723755, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8765157461166382, + "num_tokens": 796331582.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "grad_norm": 1.4742039442062378, + "learning_rate": 1e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9051142930984497, + "num_tokens": 796368624.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "grad_norm": 1.5312325954437256, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8874112367630005, + "num_tokens": 796408885.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "grad_norm": 1.6376819610595703, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8778399229049683, + "num_tokens": 796444751.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "grad_norm": 1.578714370727539, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8910773992538452, + "num_tokens": 796482118.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "grad_norm": 
1.5112701654434204, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.9001811742782593, + "num_tokens": 796517886.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "grad_norm": 1.4591377973556519, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8945834636688232, + "num_tokens": 796557093.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "grad_norm": 1.5736839771270752, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.891043484210968, + "num_tokens": 796595041.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "grad_norm": 1.4393390417099, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8878548741340637, + "num_tokens": 796637163.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "grad_norm": 1.5245615243911743, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8895249366760254, + "num_tokens": 796674978.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "grad_norm": 1.4530730247497559, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.8992229104042053, + "num_tokens": 796713604.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "grad_norm": 1.5165215730667114, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8829572200775146, + "num_tokens": 796754974.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "grad_norm": 1.5409399271011353, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.899711549282074, + "num_tokens": 796792210.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "grad_norm": 1.6164182424545288, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8933361768722534, + "num_tokens": 796830670.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "grad_norm": 1.5957552194595337, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.878524899482727, + "num_tokens": 
796869724.0, + "step": 20887 + }, + { + "epoch": 2.6571682991985752, + "grad_norm": 1.4600913524627686, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8915965557098389, + "num_tokens": 796912356.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "grad_norm": 1.5716718435287476, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8870376348495483, + "num_tokens": 796946922.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "grad_norm": 1.4959540367126465, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8807826638221741, + "num_tokens": 796990881.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "grad_norm": 1.645400047302246, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8851768970489502, + "num_tokens": 797027629.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "grad_norm": 1.6264744997024536, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8921743631362915, + "num_tokens": 797061695.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "grad_norm": 1.5883435010910034, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.894720196723938, + "num_tokens": 797097898.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "grad_norm": 1.6960679292678833, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8737167716026306, + "num_tokens": 797133790.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "grad_norm": 1.6617711782455444, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.885430097579956, + "num_tokens": 797166931.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "grad_norm": 1.5847854614257812, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8737719058990479, + "num_tokens": 797207302.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "grad_norm": 1.6837420463562012, + "learning_rate": 
1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8803627490997314, + "num_tokens": 797238763.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "grad_norm": 1.5713797807693481, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8847659826278687, + "num_tokens": 797275530.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "grad_norm": 1.5908585786819458, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8749430775642395, + "num_tokens": 797312658.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "grad_norm": 1.5414518117904663, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8868491053581238, + "num_tokens": 797350650.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "grad_norm": 1.3848810195922852, + "learning_rate": 1e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9035146236419678, + "num_tokens": 797390001.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "grad_norm": 1.6488736867904663, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8767824172973633, + "num_tokens": 797430030.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "grad_norm": 1.5423325300216675, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8780567049980164, + "num_tokens": 797469304.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "grad_norm": 1.5253394842147827, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8896610736846924, + "num_tokens": 797506503.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "grad_norm": 1.6751290559768677, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8831582069396973, + "num_tokens": 797543854.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "grad_norm": 1.592653512954712, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8705109357833862, + "num_tokens": 797586515.0, + "step": 20906 + }, + { 
+ "epoch": 2.659585294491795, + "grad_norm": 1.8168820142745972, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8779351711273193, + "num_tokens": 797621407.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "grad_norm": 1.7218618392944336, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8726846575737, + "num_tokens": 797655456.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "grad_norm": 1.5874022245407104, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8795846104621887, + "num_tokens": 797694566.0, + "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "grad_norm": 1.671533226966858, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8687087893486023, + "num_tokens": 797733378.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "grad_norm": 1.55476975440979, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8728982210159302, + "num_tokens": 797773430.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "grad_norm": 1.494983196258545, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9002402424812317, + "num_tokens": 797811491.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "grad_norm": 1.501768946647644, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.9001587629318237, + "num_tokens": 797847629.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "grad_norm": 1.458082914352417, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8827513456344604, + "num_tokens": 797890274.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "grad_norm": 1.6048550605773926, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8833760023117065, + "num_tokens": 797927899.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "grad_norm": 1.5326104164123535, + "learning_rate": 1e-06, + "loss": 0.2731, + 
"mean_token_accuracy": 0.9007784724235535, + "num_tokens": 797964979.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "grad_norm": 1.7152057886123657, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8775347471237183, + "num_tokens": 797997438.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "grad_norm": 1.533659815788269, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8844015598297119, + "num_tokens": 798035282.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "grad_norm": 1.5576512813568115, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8829618692398071, + "num_tokens": 798074530.0, + "step": 20919 + }, + { + "epoch": 2.6612390281134717, + "grad_norm": 1.6303836107254028, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8787333369255066, + "num_tokens": 798114328.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "grad_norm": 1.592469334602356, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8854900002479553, + "num_tokens": 798149152.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "grad_norm": 1.5070586204528809, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8836755156517029, + "num_tokens": 798187247.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "grad_norm": 1.592206358909607, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8819608688354492, + "num_tokens": 798224377.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "grad_norm": 1.6831058263778687, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.866751492023468, + "num_tokens": 798265669.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "grad_norm": 1.525801420211792, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8847699761390686, + "num_tokens": 798304732.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + 
"grad_norm": 1.650770902633667, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8899276256561279, + "num_tokens": 798339599.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "grad_norm": 1.6573821306228638, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.866025984287262, + "num_tokens": 798377764.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "grad_norm": 1.4534528255462646, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8898875117301941, + "num_tokens": 798417330.0, + "step": 20928 + }, + { + "epoch": 2.662383920620786, + "grad_norm": 1.6019483804702759, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8914502859115601, + "num_tokens": 798453951.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "grad_norm": 1.5499306917190552, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8855885863304138, + "num_tokens": 798495985.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "grad_norm": 1.5105705261230469, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8955230116844177, + "num_tokens": 798531598.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "grad_norm": 1.6862772703170776, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8838191628456116, + "num_tokens": 798568243.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "grad_norm": 1.600942611694336, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8858498334884644, + "num_tokens": 798604659.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "grad_norm": 1.7276736497879028, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.876046895980835, + "num_tokens": 798641765.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "grad_norm": 1.4704325199127197, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8950004577636719, + 
"num_tokens": 798681055.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "grad_norm": 1.5778887271881104, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8754258155822754, + "num_tokens": 798723050.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "grad_norm": 1.6110283136367798, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.883643388748169, + "num_tokens": 798766064.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "grad_norm": 1.517777919769287, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.883589506149292, + "num_tokens": 798807070.0, + "step": 20938 + }, + { + "epoch": 2.6636560234066913, + "grad_norm": 1.544345498085022, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8833240270614624, + "num_tokens": 798847611.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "grad_norm": 1.580634355545044, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8669798970222473, + "num_tokens": 798888172.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "grad_norm": 1.5104622840881348, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8914337158203125, + "num_tokens": 798924998.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "grad_norm": 1.4651192426681519, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8913936614990234, + "num_tokens": 798967346.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "grad_norm": 1.4672595262527466, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8855894207954407, + "num_tokens": 799009374.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "grad_norm": 1.5575753450393677, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8887729644775391, + "num_tokens": 799044028.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "grad_norm": 1.6440837383270264, + 
"learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8697499632835388, + "num_tokens": 799084658.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "grad_norm": 1.6642162799835205, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.885922908782959, + "num_tokens": 799119436.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "grad_norm": 1.4899159669876099, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8857814073562622, + "num_tokens": 799160466.0, + "step": 20947 + }, + { + "epoch": 2.6648009159140056, + "grad_norm": 1.413217544555664, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8932858109474182, + "num_tokens": 799201382.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "grad_norm": 1.7343848943710327, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8596391081809998, + "num_tokens": 799236515.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "grad_norm": 1.4105902910232544, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.897737443447113, + "num_tokens": 799278327.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "grad_norm": 1.4480664730072021, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8784831762313843, + "num_tokens": 799321858.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "grad_norm": 1.699049472808838, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8891616463661194, + "num_tokens": 799357239.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "grad_norm": 1.632083535194397, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8767328858375549, + "num_tokens": 799395680.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "grad_norm": 1.6193238496780396, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8875668048858643, + "num_tokens": 799432862.0, + "step": 
20954 + }, + { + "epoch": 2.6656913878641393, + "grad_norm": 1.5803613662719727, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8711614012718201, + "num_tokens": 799471826.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "grad_norm": 1.683052897453308, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.9012095928192139, + "num_tokens": 799501513.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "grad_norm": 1.8955811262130737, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8760994672775269, + "num_tokens": 799532822.0, + "step": 20957 + }, + { + "epoch": 2.666073018699911, + "grad_norm": 1.6170510053634644, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8828129768371582, + "num_tokens": 799569578.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "grad_norm": 1.7692174911499023, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8681613206863403, + "num_tokens": 799603342.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "grad_norm": 1.4726191759109497, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8867298364639282, + "num_tokens": 799645117.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "grad_norm": 1.7710586786270142, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8768522143363953, + "num_tokens": 799679550.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "grad_norm": 1.5974206924438477, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.883966326713562, + "num_tokens": 799716107.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "grad_norm": 1.4524290561676025, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8919309377670288, + "num_tokens": 799757045.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "grad_norm": 1.592911720275879, + "learning_rate": 1e-06, + "loss": 0.3005, + 
"mean_token_accuracy": 0.892589807510376, + "num_tokens": 799797543.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "grad_norm": 1.5704063177108765, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8814128637313843, + "num_tokens": 799838456.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "grad_norm": 1.5864123106002808, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.889045000076294, + "num_tokens": 799878968.0, + "step": 20966 + }, + { + "epoch": 2.6672179112072256, + "grad_norm": 1.4338549375534058, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.894957959651947, + "num_tokens": 799920792.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "grad_norm": 1.5651438236236572, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8795446157455444, + "num_tokens": 799963298.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "grad_norm": 1.5615849494934082, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.885979413986206, + "num_tokens": 800000115.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "grad_norm": 1.527260661125183, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.891459047794342, + "num_tokens": 800038101.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "grad_norm": 1.5118463039398193, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8953046798706055, + "num_tokens": 800075446.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "grad_norm": 1.4841958284378052, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8885760307312012, + "num_tokens": 800115445.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "grad_norm": 1.5594933032989502, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8918658494949341, + "num_tokens": 800155461.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, 
+ "grad_norm": 1.5016684532165527, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.893308699131012, + "num_tokens": 800195603.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "grad_norm": 1.439289927482605, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8946132063865662, + "num_tokens": 800232138.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "grad_norm": 1.5925959348678589, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8894280791282654, + "num_tokens": 800268378.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "grad_norm": 1.6766841411590576, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8785697221755981, + "num_tokens": 800302341.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "grad_norm": 1.5742992162704468, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8771454691886902, + "num_tokens": 800341028.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "grad_norm": 1.5172606706619263, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8735847473144531, + "num_tokens": 800383043.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "grad_norm": 1.41525137424469, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.898014485836029, + "num_tokens": 800423417.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "grad_norm": 1.5311901569366455, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8869062662124634, + "num_tokens": 800460168.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "grad_norm": 1.5215089321136475, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8838943839073181, + "num_tokens": 800498940.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "grad_norm": 1.5478566884994507, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8880266547203064, + 
"num_tokens": 800534309.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "grad_norm": 1.522557020187378, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.889039158821106, + "num_tokens": 800571280.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "grad_norm": 1.5291768312454224, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8806494474411011, + "num_tokens": 800610598.0, + "step": 20985 + }, + { + "epoch": 2.669634906500445, + "grad_norm": 1.5142439603805542, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8784785866737366, + "num_tokens": 800650252.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "grad_norm": 1.600525975227356, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8878343105316162, + "num_tokens": 800685324.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "grad_norm": 1.546212077140808, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8787825703620911, + "num_tokens": 800729456.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "grad_norm": 1.6607719659805298, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8755972981452942, + "num_tokens": 800765727.0, + "step": 20989 + }, + { + "epoch": 2.6701437476148073, + "grad_norm": 1.5220363140106201, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8716428279876709, + "num_tokens": 800806079.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "grad_norm": 1.5525314807891846, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8730931282043457, + "num_tokens": 800846636.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "grad_norm": 1.5343846082687378, + "learning_rate": 1e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8944994807243347, + "num_tokens": 800885951.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "grad_norm": 1.5446423292160034, + 
"learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8890367746353149, + "num_tokens": 800923733.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "grad_norm": 1.6137573719024658, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8864765167236328, + "num_tokens": 800958952.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "grad_norm": 1.6349427700042725, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8932126760482788, + "num_tokens": 800995287.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "grad_norm": 1.468239188194275, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.8973146677017212, + "num_tokens": 801030670.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "grad_norm": 1.4605045318603516, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8843518495559692, + "num_tokens": 801072455.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "grad_norm": 1.476785659790039, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.889498233795166, + "num_tokens": 801109119.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "grad_norm": 1.7419893741607666, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8796681761741638, + "num_tokens": 801141349.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "grad_norm": 1.4695274829864502, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8915753364562988, + "num_tokens": 801184395.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "grad_norm": 1.5092846155166626, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8812386989593506, + "num_tokens": 801224079.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "grad_norm": 1.647619366645813, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8843077421188354, + "num_tokens": 801259193.0, + "step": 21002 
+ }, + { + "epoch": 2.6717974812364838, + "grad_norm": 1.6522547006607056, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8749092817306519, + "num_tokens": 801295861.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "grad_norm": 1.6723002195358276, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8830881118774414, + "num_tokens": 801330365.0, + "step": 21004 + }, + { + "epoch": 2.672051901793665, + "grad_norm": 1.463882565498352, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8781315088272095, + "num_tokens": 801372218.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "grad_norm": 1.4730921983718872, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8943189978599548, + "num_tokens": 801413002.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "grad_norm": 1.58401620388031, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8719358444213867, + "num_tokens": 801454530.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "grad_norm": 1.534649133682251, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8976249694824219, + "num_tokens": 801491029.0, + "step": 21008 + }, + { + "epoch": 2.672560742908027, + "grad_norm": 1.4020627737045288, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8844256401062012, + "num_tokens": 801537908.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "grad_norm": 1.687922477722168, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8674718141555786, + "num_tokens": 801575349.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "grad_norm": 1.5616751909255981, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8769190311431885, + "num_tokens": 801609327.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "grad_norm": 1.4500505924224854, + "learning_rate": 1e-06, + "loss": 0.2572, + 
"mean_token_accuracy": 0.9089974761009216, + "num_tokens": 801646729.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "grad_norm": 1.5771141052246094, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8762340545654297, + "num_tokens": 801683899.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "grad_norm": 1.6093915700912476, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8848165273666382, + "num_tokens": 801720186.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "grad_norm": 1.5755393505096436, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8950635194778442, + "num_tokens": 801754564.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "grad_norm": 1.5736039876937866, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8832724094390869, + "num_tokens": 801792433.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "grad_norm": 1.5306464433670044, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8813105225563049, + "num_tokens": 801833254.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "grad_norm": 1.5412650108337402, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8915602564811707, + "num_tokens": 801871163.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "grad_norm": 1.6970185041427612, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8886328339576721, + "num_tokens": 801904740.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "grad_norm": 1.569061279296875, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8924403786659241, + "num_tokens": 801946270.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "grad_norm": 1.6381808519363403, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8809411525726318, + "num_tokens": 801981358.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, 
+ "grad_norm": 1.6643052101135254, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8904972076416016, + "num_tokens": 802013817.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "grad_norm": 1.4814671277999878, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8906919956207275, + "num_tokens": 802051816.0, + "step": 21023 + }, + { + "epoch": 2.674468897086885, + "grad_norm": 1.4950920343399048, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8795464634895325, + "num_tokens": 802091116.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "grad_norm": 1.557518482208252, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8797541856765747, + "num_tokens": 802129238.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "grad_norm": 1.4463262557983398, + "learning_rate": 1e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.907741904258728, + "num_tokens": 802167030.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "grad_norm": 1.5679394006729126, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8928477168083191, + "num_tokens": 802204405.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "grad_norm": 1.7666125297546387, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8817887306213379, + "num_tokens": 802236785.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "grad_norm": 1.8506718873977661, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8862638473510742, + "num_tokens": 802269981.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "grad_norm": 1.4842370748519897, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8994025588035583, + "num_tokens": 802306980.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "grad_norm": 1.3617340326309204, + "learning_rate": 1e-06, + "loss": 0.257, + "mean_token_accuracy": 0.9064679741859436, + 
"num_tokens": 802349504.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "grad_norm": 1.538068413734436, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.8994755744934082, + "num_tokens": 802383992.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "grad_norm": 1.467448353767395, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8909417390823364, + "num_tokens": 802427048.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "grad_norm": 1.7897158861160278, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8769254088401794, + "num_tokens": 802462942.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "grad_norm": 1.32866370677948, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8988550901412964, + "num_tokens": 802507713.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "grad_norm": 1.7413451671600342, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8731503486633301, + "num_tokens": 802539767.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "grad_norm": 1.7016760110855103, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8906248807907104, + "num_tokens": 802575157.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "grad_norm": 1.4389034509658813, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8897016048431396, + "num_tokens": 802619080.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "grad_norm": 1.6088237762451172, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8743714690208435, + "num_tokens": 802658151.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "grad_norm": 1.7494442462921143, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8791767358779907, + "num_tokens": 802688424.0, + "step": 21040 + }, + { + "epoch": 2.6766314718229234, + "grad_norm": 1.5161734819412231, + 
"learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8943631052970886, + "num_tokens": 802724334.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "grad_norm": 1.6674909591674805, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8781446218490601, + "num_tokens": 802761097.0, + "step": 21042 + }, + { + "epoch": 2.6768858923801044, + "grad_norm": 1.569516658782959, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8909459710121155, + "num_tokens": 802797791.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "grad_norm": 1.561864972114563, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8871546983718872, + "num_tokens": 802833598.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "grad_norm": 1.5325111150741577, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8963723182678223, + "num_tokens": 802869975.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "grad_norm": 1.567496657371521, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8774893283843994, + "num_tokens": 802908525.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "grad_norm": 1.5922236442565918, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8877655267715454, + "num_tokens": 802943842.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "grad_norm": 1.5957001447677612, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8791394233703613, + "num_tokens": 802983996.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "grad_norm": 1.7337411642074585, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8935226798057556, + "num_tokens": 803015573.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "grad_norm": 1.6020935773849487, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8881847262382507, + "num_tokens": 803049815.0, + "step": 
21050 + }, + { + "epoch": 2.6779035746088287, + "grad_norm": 1.5551148653030396, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8779767751693726, + "num_tokens": 803092449.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "grad_norm": 1.6279685497283936, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8885587453842163, + "num_tokens": 803123920.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "grad_norm": 1.5426172018051147, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8786447644233704, + "num_tokens": 803164604.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "grad_norm": 1.5013242959976196, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8722987771034241, + "num_tokens": 803207390.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "grad_norm": 1.4301176071166992, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8821932077407837, + "num_tokens": 803253504.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "grad_norm": 1.6472954750061035, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8881011009216309, + "num_tokens": 803287458.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "grad_norm": 1.5786134004592896, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8705897927284241, + "num_tokens": 803327863.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "grad_norm": 1.645338535308838, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8691409826278687, + "num_tokens": 803368050.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "grad_norm": 1.4328562021255493, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8941360712051392, + "num_tokens": 803408349.0, + "step": 21059 + }, + { + "epoch": 2.679048467116143, + "grad_norm": 1.6317965984344482, + "learning_rate": 1e-06, + "loss": 0.3222, + 
"mean_token_accuracy": 0.8853325247764587, + "num_tokens": 803445525.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "grad_norm": 1.5628128051757812, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8868814706802368, + "num_tokens": 803486043.0, + "step": 21061 + }, + { + "epoch": 2.679302887673324, + "grad_norm": 1.6242619752883911, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8798542022705078, + "num_tokens": 803524000.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "grad_norm": 1.7085206508636475, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8827047944068909, + "num_tokens": 803558036.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "grad_norm": 1.4946566820144653, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8947838544845581, + "num_tokens": 803593888.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "grad_norm": 1.4340407848358154, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8930751085281372, + "num_tokens": 803637758.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "grad_norm": 1.4863238334655762, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8803056478500366, + "num_tokens": 803677760.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "grad_norm": 1.4774221181869507, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8931993246078491, + "num_tokens": 803717355.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "grad_norm": 1.5627086162567139, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8940830230712891, + "num_tokens": 803752904.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "grad_norm": 1.6663511991500854, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8628028631210327, + "num_tokens": 803791404.0, + "step": 21069 + }, + { + "epoch": 
2.6803205699020483, + "grad_norm": 1.593863606452942, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8861229419708252, + "num_tokens": 803826803.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "grad_norm": 1.494665503501892, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.868280291557312, + "num_tokens": 803872679.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "grad_norm": 1.5216195583343506, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.892536461353302, + "num_tokens": 803911036.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "grad_norm": 1.5032004117965698, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8906580209732056, + "num_tokens": 803950208.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "grad_norm": 1.4869338274002075, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8977300524711609, + "num_tokens": 803987351.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "grad_norm": 1.4614338874816895, + "learning_rate": 1e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9043852686882019, + "num_tokens": 804026546.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "grad_norm": 1.5671466588974, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8958761692047119, + "num_tokens": 804063402.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "grad_norm": 1.5437424182891846, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8807110786437988, + "num_tokens": 804102960.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "grad_norm": 1.6439826488494873, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8782278299331665, + "num_tokens": 804141127.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "grad_norm": 1.64198637008667, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 
0.8809138536453247, + "num_tokens": 804177675.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "grad_norm": 2.3050694465637207, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8878985643386841, + "num_tokens": 804216870.0, + "step": 21080 + }, + { + "epoch": 2.6817198829665436, + "grad_norm": 1.5435817241668701, + "learning_rate": 1e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.902292788028717, + "num_tokens": 804252431.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "grad_norm": 1.4970725774765015, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8785903453826904, + "num_tokens": 804295202.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "grad_norm": 1.686594009399414, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8802191019058228, + "num_tokens": 804329732.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "grad_norm": 1.5990350246429443, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8795536756515503, + "num_tokens": 804370904.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "grad_norm": 1.6160751581192017, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8968213796615601, + "num_tokens": 804404633.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "grad_norm": 1.6740407943725586, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.892457127571106, + "num_tokens": 804434846.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "grad_norm": 1.628515362739563, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8956649303436279, + "num_tokens": 804467627.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "grad_norm": 1.5148224830627441, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8812050819396973, + "num_tokens": 804511278.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "grad_norm": 
1.528791069984436, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8914545774459839, + "num_tokens": 804550505.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "grad_norm": 1.535233736038208, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8800346255302429, + "num_tokens": 804591426.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "grad_norm": 1.5493241548538208, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8927537798881531, + "num_tokens": 804626337.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "grad_norm": 1.5705292224884033, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.884247899055481, + "num_tokens": 804665028.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "grad_norm": 1.4778800010681152, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8788048028945923, + "num_tokens": 804707937.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "grad_norm": 1.4957748651504517, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8915952444076538, + "num_tokens": 804745667.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "grad_norm": 1.6849523782730103, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.880988359451294, + "num_tokens": 804781253.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "grad_norm": 1.9621907472610474, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8874529004096985, + "num_tokens": 804816040.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "grad_norm": 1.5301334857940674, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8921482563018799, + "num_tokens": 804851625.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "grad_norm": 1.6436914205551147, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.883423924446106, + "num_tokens": 
804888823.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "grad_norm": 1.4666624069213867, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8853217959403992, + "num_tokens": 804933113.0, + "step": 21099 + }, + { + "epoch": 2.684136878259763, + "grad_norm": 1.413404107093811, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8904627561569214, + "num_tokens": 804976423.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "grad_norm": 1.4616026878356934, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8883600234985352, + "num_tokens": 805015306.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "grad_norm": 1.5868711471557617, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8948990702629089, + "num_tokens": 805054852.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "grad_norm": 1.41289222240448, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8975530862808228, + "num_tokens": 805094928.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "grad_norm": 1.4987707138061523, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8885073065757751, + "num_tokens": 805133420.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "grad_norm": 1.4494537115097046, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8901475667953491, + "num_tokens": 805172713.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "grad_norm": 1.4770361185073853, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.892499566078186, + "num_tokens": 805212127.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "grad_norm": 1.5739564895629883, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8902217149734497, + "num_tokens": 805248371.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "grad_norm": 1.4689632654190063, + "learning_rate": 1e-06, 
+ "loss": 0.2998, + "mean_token_accuracy": 0.8928453922271729, + "num_tokens": 805289262.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "grad_norm": 1.6861132383346558, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8850441575050354, + "num_tokens": 805325698.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "grad_norm": 1.5183125734329224, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8810436129570007, + "num_tokens": 805369439.0, + "step": 21110 + }, + { + "epoch": 2.685536191324259, + "grad_norm": 1.5371792316436768, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.886672854423523, + "num_tokens": 805405535.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "grad_norm": 1.5706589221954346, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8810297846794128, + "num_tokens": 805443396.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "grad_norm": 1.680300235748291, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8948372602462769, + "num_tokens": 805477838.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "grad_norm": 1.4778236150741577, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8861695528030396, + "num_tokens": 805518322.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "grad_norm": 1.6323552131652832, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8740053772926331, + "num_tokens": 805558555.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "grad_norm": 1.7032040357589722, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8736017942428589, + "num_tokens": 805593893.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "grad_norm": 1.5100576877593994, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8920577764511108, + "num_tokens": 805634225.0, + "step": 21117 + }, + { + 
"epoch": 2.6864266632743927, + "grad_norm": 1.4481782913208008, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8984399437904358, + "num_tokens": 805674925.0, + "step": 21118 + }, + { + "epoch": 2.686553873552983, + "grad_norm": 1.5169864892959595, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8894174098968506, + "num_tokens": 805714298.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "grad_norm": 1.7170968055725098, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8837608695030212, + "num_tokens": 805753521.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "grad_norm": 1.5057480335235596, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.884796679019928, + "num_tokens": 805793771.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "grad_norm": 1.488044261932373, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8793734908103943, + "num_tokens": 805836108.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "grad_norm": 1.605800986289978, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8824290037155151, + "num_tokens": 805875819.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "grad_norm": 1.7020065784454346, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8753553628921509, + "num_tokens": 805908648.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "grad_norm": 1.5826280117034912, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8782951831817627, + "num_tokens": 805948780.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "grad_norm": 1.5602704286575317, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8935523629188538, + "num_tokens": 805987854.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "grad_norm": 1.5267030000686646, + "learning_rate": 1e-06, + "loss": 0.3061, + 
"mean_token_accuracy": 0.8881194591522217, + "num_tokens": 806027355.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "grad_norm": 1.5578211545944214, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8769856691360474, + "num_tokens": 806069945.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "grad_norm": 1.5348867177963257, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.896129846572876, + "num_tokens": 806108807.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "grad_norm": 1.4708194732666016, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8949843645095825, + "num_tokens": 806149542.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "grad_norm": 1.4250253438949585, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8777731657028198, + "num_tokens": 806194700.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "grad_norm": 1.6087239980697632, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.892711877822876, + "num_tokens": 806231867.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "grad_norm": 1.5861212015151978, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8773463368415833, + "num_tokens": 806271217.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "grad_norm": 1.632176399230957, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8771485686302185, + "num_tokens": 806310463.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "grad_norm": 1.5483700037002563, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8884459733963013, + "num_tokens": 806347960.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "grad_norm": 1.623340368270874, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8819055557250977, + "num_tokens": 806387572.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + 
"grad_norm": 1.437544584274292, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8897677659988403, + "num_tokens": 806430441.0, + "step": 21137 + }, + { + "epoch": 2.688970868846203, + "grad_norm": 1.488525152206421, + "learning_rate": 1e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.8987080454826355, + "num_tokens": 806469194.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "grad_norm": 1.629823923110962, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8885398507118225, + "num_tokens": 806505722.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "grad_norm": 1.6069639921188354, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8814070820808411, + "num_tokens": 806541919.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "grad_norm": 1.615236759185791, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8889299631118774, + "num_tokens": 806579894.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "grad_norm": 1.6314712762832642, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8913614153862, + "num_tokens": 806617489.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "grad_norm": 1.6327182054519653, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8804923295974731, + "num_tokens": 806657542.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "grad_norm": 1.6681585311889648, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8788491487503052, + "num_tokens": 806692015.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "grad_norm": 1.6751940250396729, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8990874290466309, + "num_tokens": 806725017.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "grad_norm": 1.5749766826629639, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8859853744506836, + 
"num_tokens": 806765326.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "grad_norm": 1.6909996271133423, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8773322105407715, + "num_tokens": 806804184.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "grad_norm": 1.460095763206482, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8915739059448242, + "num_tokens": 806840016.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "grad_norm": 1.5725758075714111, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8819897770881653, + "num_tokens": 806883858.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "grad_norm": 1.4577436447143555, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8841092586517334, + "num_tokens": 806925281.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "grad_norm": 1.5423516035079956, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.88719642162323, + "num_tokens": 806961833.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "grad_norm": 1.4630188941955566, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.8984140753746033, + "num_tokens": 806998358.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "grad_norm": 1.4883240461349487, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8845275640487671, + "num_tokens": 807043203.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "grad_norm": 1.6894272565841675, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8811633586883545, + "num_tokens": 807078387.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "grad_norm": 1.5105807781219482, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8814515471458435, + "num_tokens": 807119153.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "grad_norm": 1.5832573175430298, + 
"learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8735425472259521, + "num_tokens": 807155648.0, + "step": 21156 + }, + { + "epoch": 2.6913878641394224, + "grad_norm": 1.4304853677749634, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8856576681137085, + "num_tokens": 807196643.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "grad_norm": 1.4561100006103516, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8934183120727539, + "num_tokens": 807238738.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "grad_norm": 1.454987645149231, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8904833793640137, + "num_tokens": 807279797.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "grad_norm": 1.4944764375686646, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8912971615791321, + "num_tokens": 807318996.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "grad_norm": 1.4826974868774414, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8893944621086121, + "num_tokens": 807359467.0, + "step": 21161 + }, + { + "epoch": 2.692023915532375, + "grad_norm": 1.5207092761993408, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9006023406982422, + "num_tokens": 807398303.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "grad_norm": 1.6031543016433716, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8847100734710693, + "num_tokens": 807435839.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "grad_norm": 1.443620204925537, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8890231847763062, + "num_tokens": 807480208.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "grad_norm": 1.572265386581421, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8847582340240479, + "num_tokens": 807516784.0, + "step": 
21165 + }, + { + "epoch": 2.692532756646737, + "grad_norm": 1.6222128868103027, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8917196989059448, + "num_tokens": 807550722.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "grad_norm": 1.5621556043624878, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8885475993156433, + "num_tokens": 807587516.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "grad_norm": 1.6807063817977905, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8772262334823608, + "num_tokens": 807622190.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "grad_norm": 1.567379117012024, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8868423700332642, + "num_tokens": 807659457.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "grad_norm": 1.563373327255249, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8867380619049072, + "num_tokens": 807694658.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "grad_norm": 1.66703462600708, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8905050158500671, + "num_tokens": 807728856.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "grad_norm": 1.4376137256622314, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8908300995826721, + "num_tokens": 807769848.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "grad_norm": 1.4676909446716309, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8895429372787476, + "num_tokens": 807809045.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "grad_norm": 1.4735174179077148, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8941845893859863, + "num_tokens": 807848671.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "grad_norm": 1.5264320373535156, + "learning_rate": 1e-06, + "loss": 0.3238, + 
"mean_token_accuracy": 0.884389340877533, + "num_tokens": 807888680.0, + "step": 21175 + }, + { + "epoch": 2.693804859432642, + "grad_norm": 1.5726820230484009, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8902335166931152, + "num_tokens": 807927655.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "grad_norm": 1.5004360675811768, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8811317682266235, + "num_tokens": 807972648.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "grad_norm": 1.5254746675491333, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8907918930053711, + "num_tokens": 808009846.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "grad_norm": 1.498985767364502, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8795880675315857, + "num_tokens": 808050240.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "grad_norm": 1.730843186378479, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8785333037376404, + "num_tokens": 808083160.0, + "step": 21180 + }, + { + "epoch": 2.6944409108255947, + "grad_norm": 1.5777398347854614, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8692269325256348, + "num_tokens": 808124361.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "grad_norm": 1.50840425491333, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8860107660293579, + "num_tokens": 808168456.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "grad_norm": 1.6648997068405151, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8735964298248291, + "num_tokens": 808206912.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "grad_norm": 1.4502298831939697, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.879586935043335, + "num_tokens": 808252202.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + 
"grad_norm": 1.642885684967041, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.886657178401947, + "num_tokens": 808289593.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "grad_norm": 1.4935520887374878, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8876721262931824, + "num_tokens": 808327381.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "grad_norm": 1.5551906824111938, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8807241916656494, + "num_tokens": 808365387.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "grad_norm": 1.8086785078048706, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8625873327255249, + "num_tokens": 808398172.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "grad_norm": 1.6079453229904175, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8850389719009399, + "num_tokens": 808433090.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "grad_norm": 1.72676420211792, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.868416428565979, + "num_tokens": 808465667.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "grad_norm": 1.5242730379104614, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8978840112686157, + "num_tokens": 808502814.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "grad_norm": 1.4141764640808105, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8890366554260254, + "num_tokens": 808545630.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "grad_norm": 1.5850733518600464, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8945178985595703, + "num_tokens": 808581614.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "grad_norm": 1.6233421564102173, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8722859621047974, + 
"num_tokens": 808619779.0, + "step": 21194 + }, + { + "epoch": 2.696221854725862, + "grad_norm": 1.5613421201705933, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8832377791404724, + "num_tokens": 808659023.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "grad_norm": 1.5231502056121826, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8862333297729492, + "num_tokens": 808697016.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "grad_norm": 1.5946565866470337, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8864122629165649, + "num_tokens": 808731054.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "grad_norm": 1.5639129877090454, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8927366733551025, + "num_tokens": 808767736.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "grad_norm": 1.5774117708206177, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8823854923248291, + "num_tokens": 808806128.0, + "step": 21199 + }, + { + "epoch": 2.6968579061188143, + "grad_norm": 1.4341626167297363, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8848198056221008, + "num_tokens": 808850916.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "grad_norm": 1.7208044528961182, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8723657727241516, + "num_tokens": 808886049.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "grad_norm": 1.4812225103378296, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8884422183036804, + "num_tokens": 808926818.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "grad_norm": 1.7338857650756836, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8822680115699768, + "num_tokens": 808955739.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "grad_norm": 1.6280475854873657, + 
"learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8842964768409729, + "num_tokens": 808994078.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "grad_norm": 1.5397850275039673, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8938246369361877, + "num_tokens": 809030897.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "grad_norm": 1.507556438446045, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.901276707649231, + "num_tokens": 809076597.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "grad_norm": 1.5936163663864136, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8754612803459167, + "num_tokens": 809117414.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "grad_norm": 1.4774420261383057, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8943175673484802, + "num_tokens": 809157896.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "grad_norm": 1.567317247390747, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8974500894546509, + "num_tokens": 809194066.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "grad_norm": 1.5513993501663208, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8842817544937134, + "num_tokens": 809233671.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "grad_norm": 1.5374451875686646, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8894206285476685, + "num_tokens": 809270813.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "grad_norm": 1.517601490020752, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8877154588699341, + "num_tokens": 809315687.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "grad_norm": 1.667733907699585, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8832975029945374, + "num_tokens": 809350781.0, + "step": 
21213 + }, + { + "epoch": 2.6986388500190817, + "grad_norm": 1.6105847358703613, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8896805047988892, + "num_tokens": 809384371.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "grad_norm": 1.555116891860962, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8839820027351379, + "num_tokens": 809420812.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "grad_norm": 1.5093600749969482, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8958402276039124, + "num_tokens": 809458815.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "grad_norm": 1.6018810272216797, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8855302333831787, + "num_tokens": 809494987.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "grad_norm": 1.6013089418411255, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.866908609867096, + "num_tokens": 809537523.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "grad_norm": 1.8190239667892456, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8865290880203247, + "num_tokens": 809576033.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "grad_norm": 1.7344156503677368, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8678890466690063, + "num_tokens": 809613821.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "grad_norm": 1.6429147720336914, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8834387063980103, + "num_tokens": 809650102.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "grad_norm": 1.5277026891708374, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8872894644737244, + "num_tokens": 809691046.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "grad_norm": 1.5649727582931519, + "learning_rate": 1e-06, + "loss": 0.2975, + 
"mean_token_accuracy": 0.894155740737915, + "num_tokens": 809725378.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "grad_norm": 1.7242294549942017, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8877224326133728, + "num_tokens": 809758727.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "grad_norm": 1.554184079170227, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8814406394958496, + "num_tokens": 809800000.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "grad_norm": 1.5681650638580322, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8813610076904297, + "num_tokens": 809840661.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "grad_norm": 1.6996132135391235, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8985984325408936, + "num_tokens": 809872866.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "grad_norm": 1.582065463066101, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8790658712387085, + "num_tokens": 809912047.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "grad_norm": 1.5628862380981445, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8819538354873657, + "num_tokens": 809951913.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "grad_norm": 1.5470253229141235, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.882647693157196, + "num_tokens": 809991703.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "grad_norm": 1.4888477325439453, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8893030881881714, + "num_tokens": 810033483.0, + "step": 21231 + }, + { + "epoch": 2.7009286350337107, + "grad_norm": 1.663103699684143, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8910389542579651, + "num_tokens": 810065000.0, + "step": 21232 + }, + { + "epoch": 2.7010558453123013, + 
"grad_norm": 1.5162016153335571, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.8960776329040527, + "num_tokens": 810103383.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "grad_norm": 1.4725333452224731, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8680734634399414, + "num_tokens": 810146141.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "grad_norm": 1.511688232421875, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8841403722763062, + "num_tokens": 810191595.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "grad_norm": 1.457397699356079, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8892529010772705, + "num_tokens": 810230251.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "grad_norm": 1.4933611154556274, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8896657228469849, + "num_tokens": 810270364.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "grad_norm": 1.5352873802185059, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8864871263504028, + "num_tokens": 810308871.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "grad_norm": 1.5577739477157593, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8945222496986389, + "num_tokens": 810345308.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "grad_norm": 1.581346035003662, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8945878744125366, + "num_tokens": 810379037.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "grad_norm": 1.5052666664123535, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8866218328475952, + "num_tokens": 810418232.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "grad_norm": 1.5018025636672974, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8930010199546814, + 
"num_tokens": 810456159.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "grad_norm": 1.51911461353302, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8870303630828857, + "num_tokens": 810496717.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "grad_norm": 1.8007031679153442, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8766145706176758, + "num_tokens": 810530266.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "grad_norm": 1.5635414123535156, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.880842387676239, + "num_tokens": 810569536.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "grad_norm": 1.625093698501587, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8859939575195312, + "num_tokens": 810604296.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "grad_norm": 1.6325033903121948, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8872691988945007, + "num_tokens": 810642452.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "grad_norm": 1.6350042819976807, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8851456046104431, + "num_tokens": 810676915.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "grad_norm": 1.6880645751953125, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8825836777687073, + "num_tokens": 810711028.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "grad_norm": 1.5875097513198853, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.888050377368927, + "num_tokens": 810749818.0, + "step": 21250 + }, + { + "epoch": 2.7033456303269303, + "grad_norm": 1.68959379196167, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8798720836639404, + "num_tokens": 810783486.0, + "step": 21251 + }, + { + "epoch": 2.703472840605521, + "grad_norm": 1.4332621097564697, + 
"learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8992129564285278, + "num_tokens": 810827732.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "grad_norm": 1.576838731765747, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.872341513633728, + "num_tokens": 810867496.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "grad_norm": 1.4959138631820679, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8870159387588501, + "num_tokens": 810906874.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "grad_norm": 1.4846347570419312, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8896668553352356, + "num_tokens": 810950765.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "grad_norm": 1.6424216032028198, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8741387128829956, + "num_tokens": 810988782.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "grad_norm": 1.6947410106658936, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8878562450408936, + "num_tokens": 811021165.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "grad_norm": 1.491671085357666, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.892012357711792, + "num_tokens": 811059603.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "grad_norm": 1.4571607112884521, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8827040791511536, + "num_tokens": 811103719.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "grad_norm": 1.4538651704788208, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8851355314254761, + "num_tokens": 811147168.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "grad_norm": 1.6352020502090454, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8999025225639343, + "num_tokens": 811180536.0, + "step": 
21261 + }, + { + "epoch": 2.704744943391426, + "grad_norm": 1.5524489879608154, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8864129185676575, + "num_tokens": 811218481.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "grad_norm": 1.5344717502593994, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8825051784515381, + "num_tokens": 811255422.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "grad_norm": 1.4688469171524048, + "learning_rate": 1e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9039390683174133, + "num_tokens": 811293960.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "grad_norm": 1.488550066947937, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.897267758846283, + "num_tokens": 811335236.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "grad_norm": 1.5490785837173462, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.888992190361023, + "num_tokens": 811376381.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "grad_norm": 1.5642614364624023, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8850208520889282, + "num_tokens": 811416706.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "grad_norm": 1.4436548948287964, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8884847164154053, + "num_tokens": 811460322.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "grad_norm": 1.5433586835861206, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8898340463638306, + "num_tokens": 811501078.0, + "step": 21269 + }, + { + "epoch": 2.7057626256201504, + "grad_norm": 1.5090234279632568, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.9013280868530273, + "num_tokens": 811534959.0, + "step": 21270 + }, + { + "epoch": 2.7058898358987404, + "grad_norm": 1.5617839097976685, + "learning_rate": 1e-06, + "loss": 0.2883, 
+ "mean_token_accuracy": 0.8946939706802368, + "num_tokens": 811570283.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "grad_norm": 1.6699048280715942, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8853827714920044, + "num_tokens": 811605025.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "grad_norm": 1.621166706085205, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8837430477142334, + "num_tokens": 811643207.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "grad_norm": 1.5515040159225464, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8734086155891418, + "num_tokens": 811682143.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "grad_norm": 1.4966213703155518, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8864560127258301, + "num_tokens": 811721551.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "grad_norm": 1.6635196208953857, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8781865835189819, + "num_tokens": 811756894.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "grad_norm": 1.5009353160858154, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8891721963882446, + "num_tokens": 811795549.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "grad_norm": 1.4423706531524658, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8804078102111816, + "num_tokens": 811840296.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "grad_norm": 1.4349907636642456, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9020960330963135, + "num_tokens": 811877750.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "grad_norm": 1.8172687292099, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8914892077445984, + "num_tokens": 811902985.0, + "step": 21280 + }, + { + "epoch": 
2.7071619386846457, + "grad_norm": 1.5960681438446045, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8810173869132996, + "num_tokens": 811939756.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "grad_norm": 1.580399751663208, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8889326453208923, + "num_tokens": 811978332.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "grad_norm": 1.4194778203964233, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8857245445251465, + "num_tokens": 812022412.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "grad_norm": 1.5803085565567017, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8861343860626221, + "num_tokens": 812059919.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "grad_norm": 1.561570405960083, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8799065351486206, + "num_tokens": 812098996.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "grad_norm": 1.758592128753662, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8780596852302551, + "num_tokens": 812134207.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "grad_norm": 1.560265064239502, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8941895961761475, + "num_tokens": 812169231.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "grad_norm": 1.548215627670288, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8894708156585693, + "num_tokens": 812209533.0, + "step": 21288 + }, + { + "epoch": 2.70817962091337, + "grad_norm": 1.783624529838562, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.873801589012146, + "num_tokens": 812241118.0, + "step": 21289 + }, + { + "epoch": 2.7083068311919605, + "grad_norm": 1.6846954822540283, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 
0.8940584659576416, + "num_tokens": 812276325.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "grad_norm": 1.4759235382080078, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8932124972343445, + "num_tokens": 812319671.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "grad_norm": 1.7247649431228638, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8802154660224915, + "num_tokens": 812352916.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "grad_norm": 1.5360510349273682, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8856988549232483, + "num_tokens": 812392040.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "grad_norm": 1.620575189590454, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8815762996673584, + "num_tokens": 812429002.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "grad_norm": 1.6186715364456177, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8907920122146606, + "num_tokens": 812464133.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "grad_norm": 1.5747038125991821, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8912143111228943, + "num_tokens": 812503143.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "grad_norm": 1.488092303276062, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8943747282028198, + "num_tokens": 812546247.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "grad_norm": 1.6914453506469727, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8869195580482483, + "num_tokens": 812583881.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "grad_norm": 1.6279656887054443, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8870295882225037, + "num_tokens": 812618892.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "grad_norm": 
1.6390490531921387, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8941434621810913, + "num_tokens": 812650277.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "grad_norm": 1.454506754875183, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.89090895652771, + "num_tokens": 812690909.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "grad_norm": 1.508098840713501, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.875912070274353, + "num_tokens": 812731749.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "grad_norm": 1.5600730180740356, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8934847116470337, + "num_tokens": 812769861.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "grad_norm": 1.4351520538330078, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8917754888534546, + "num_tokens": 812807759.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "grad_norm": 1.5140233039855957, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8822335600852966, + "num_tokens": 812849477.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "grad_norm": 1.5395830869674683, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9052029252052307, + "num_tokens": 812883109.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "grad_norm": 1.6302534341812134, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8883600831031799, + "num_tokens": 812915580.0, + "step": 21307 + }, + { + "epoch": 2.7105966162065895, + "grad_norm": 1.7276721000671387, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8798160552978516, + "num_tokens": 812950447.0, + "step": 21308 + }, + { + "epoch": 2.71072382648518, + "grad_norm": 1.4706319570541382, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8916115760803223, + "num_tokens": 
812993383.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "grad_norm": 1.4431922435760498, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.8986403942108154, + "num_tokens": 813031850.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "grad_norm": 1.5622190237045288, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.885812520980835, + "num_tokens": 813067859.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "grad_norm": 1.50275456905365, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8948245644569397, + "num_tokens": 813105646.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "grad_norm": 1.468704342842102, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8930656909942627, + "num_tokens": 813144467.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "grad_norm": 1.5030914545059204, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8958148956298828, + "num_tokens": 813178536.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "grad_norm": 1.4624783992767334, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8973029851913452, + "num_tokens": 813216315.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "grad_norm": 1.3668344020843506, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8857393264770508, + "num_tokens": 813261139.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "grad_norm": 1.5849231481552124, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.881804883480072, + "num_tokens": 813297938.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "grad_norm": 1.454168438911438, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8901799917221069, + "num_tokens": 813339485.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "grad_norm": 1.63359797000885, + "learning_rate": 1e-06, + 
"loss": 0.3624, + "mean_token_accuracy": 0.8735524415969849, + "num_tokens": 813376519.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "grad_norm": 1.5140221118927002, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8853301405906677, + "num_tokens": 813415732.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "grad_norm": 1.52525794506073, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.882396936416626, + "num_tokens": 813458115.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "grad_norm": 1.6716783046722412, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8763612508773804, + "num_tokens": 813497225.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "grad_norm": 1.5695300102233887, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.879139244556427, + "num_tokens": 813534796.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "grad_norm": 1.6170860528945923, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8939516544342041, + "num_tokens": 813567196.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "grad_norm": 1.6168582439422607, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8683928847312927, + "num_tokens": 813606059.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "grad_norm": 1.508100986480713, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8871709704399109, + "num_tokens": 813647462.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "grad_norm": 1.6112040281295776, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8973871469497681, + "num_tokens": 813689404.0, + "step": 21327 + }, + { + "epoch": 2.7131408217783997, + "grad_norm": 1.5896902084350586, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8901033401489258, + "num_tokens": 813724464.0, + "step": 21328 + }, + { + "epoch": 
2.71326803205699, + "grad_norm": 1.4984991550445557, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8942725658416748, + "num_tokens": 813762888.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "grad_norm": 1.7949000597000122, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8869547843933105, + "num_tokens": 813803316.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "grad_norm": 1.6493512392044067, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8663002252578735, + "num_tokens": 813838483.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "grad_norm": 1.8110803365707397, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8708152770996094, + "num_tokens": 813873690.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "grad_norm": 1.5670267343521118, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8865687251091003, + "num_tokens": 813910337.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "grad_norm": 1.5385218858718872, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8788800239562988, + "num_tokens": 813948342.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "grad_norm": 1.4864422082901, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8964359760284424, + "num_tokens": 813983851.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "grad_norm": 1.4003496170043945, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8928433656692505, + "num_tokens": 814028986.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.628265142440796, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.88298100233078, + "num_tokens": 814065327.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "grad_norm": 1.5888447761535645, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 
0.885114848613739, + "num_tokens": 814104431.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "grad_norm": 1.4496632814407349, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8981108069419861, + "num_tokens": 814143608.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "grad_norm": 1.7520164251327515, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8840543031692505, + "num_tokens": 814176108.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "grad_norm": 1.4798269271850586, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8873007297515869, + "num_tokens": 814219429.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "grad_norm": 1.6298800706863403, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8927834033966064, + "num_tokens": 814258259.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "grad_norm": 1.6980232000350952, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8777628540992737, + "num_tokens": 814293097.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "grad_norm": 1.62738037109375, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8946628570556641, + "num_tokens": 814324854.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "grad_norm": 1.4709832668304443, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8849684596061707, + "num_tokens": 814368474.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "grad_norm": 1.5985143184661865, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8900607824325562, + "num_tokens": 814402946.0, + "step": 21346 + }, + { + "epoch": 2.7155578170716193, + "grad_norm": 1.550218105316162, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8867365717887878, + "num_tokens": 814446565.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "grad_norm": 
1.5062209367752075, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.893640398979187, + "num_tokens": 814486569.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "grad_norm": 1.758256196975708, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8793638944625854, + "num_tokens": 814519379.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "grad_norm": 1.5763269662857056, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8810766935348511, + "num_tokens": 814560763.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "grad_norm": 1.6671603918075562, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.872399091720581, + "num_tokens": 814594696.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "grad_norm": 1.549322247505188, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.8999388217926025, + "num_tokens": 814628526.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "grad_norm": 1.3925502300262451, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8896865844726562, + "num_tokens": 814673034.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "grad_norm": 1.6219184398651123, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8842253684997559, + "num_tokens": 814710059.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "grad_norm": 1.6010725498199463, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8852369785308838, + "num_tokens": 814746323.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "grad_norm": 1.6075057983398438, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8857740163803101, + "num_tokens": 814782143.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "grad_norm": 1.6341643333435059, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8915448188781738, + "num_tokens": 
814816239.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "grad_norm": 1.47881281375885, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8792644739151001, + "num_tokens": 814858772.0, + "step": 21358 + }, + { + "epoch": 2.7170843404147056, + "grad_norm": 1.4676774740219116, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8928122520446777, + "num_tokens": 814898241.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "grad_norm": 1.5351381301879883, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8824831247329712, + "num_tokens": 814936670.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "grad_norm": 1.5574685335159302, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8882541656494141, + "num_tokens": 814974511.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "grad_norm": 1.5442767143249512, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.887647271156311, + "num_tokens": 815016025.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "grad_norm": 1.6545209884643555, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8692983388900757, + "num_tokens": 815055291.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "grad_norm": 1.4749220609664917, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.894635796546936, + "num_tokens": 815095654.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "grad_norm": 1.4449965953826904, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.9007886648178101, + "num_tokens": 815134078.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + "grad_norm": 1.3856520652770996, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8880379796028137, + "num_tokens": 815181319.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "grad_norm": 1.5506184101104736, + "learning_rate": 
1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8898651003837585, + "num_tokens": 815219457.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "grad_norm": 1.5981945991516113, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8799993991851807, + "num_tokens": 815257031.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "grad_norm": 1.3555502891540527, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8979665040969849, + "num_tokens": 815301884.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "grad_norm": 1.4902766942977905, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8783851265907288, + "num_tokens": 815346544.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "grad_norm": 1.5040897130966187, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8894718885421753, + "num_tokens": 815384474.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "grad_norm": 1.5372531414031982, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8808448314666748, + "num_tokens": 815425124.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "grad_norm": 1.5867782831192017, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8711408376693726, + "num_tokens": 815465081.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "grad_norm": 1.51424241065979, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8778927326202393, + "num_tokens": 815505000.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "grad_norm": 1.5855358839035034, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8871515989303589, + "num_tokens": 815542274.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "grad_norm": 1.6107620000839233, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8941125869750977, + "num_tokens": 815573860.0, + "step": 21376 + }, + { + 
"epoch": 2.7193741254293347, + "grad_norm": 1.6535917520523071, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8690991401672363, + "num_tokens": 815612518.0, + "step": 21377 + }, + { + "epoch": 2.719501335707925, + "grad_norm": 1.4085888862609863, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8868491649627686, + "num_tokens": 815658285.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "grad_norm": 1.5719294548034668, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8828771114349365, + "num_tokens": 815693740.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "grad_norm": 1.5048677921295166, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8962621688842773, + "num_tokens": 815731333.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "grad_norm": 1.366452932357788, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8871883153915405, + "num_tokens": 815776989.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "grad_norm": 1.3792229890823364, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8884904980659485, + "num_tokens": 815824272.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "grad_norm": 1.5992119312286377, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8823332786560059, + "num_tokens": 815862959.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "grad_norm": 1.443623423576355, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8901733160018921, + "num_tokens": 815906727.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + "grad_norm": 1.548476219177246, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8648315668106079, + "num_tokens": 815947191.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "grad_norm": 1.5329017639160156, + "learning_rate": 1e-06, + "loss": 0.2589, + 
"mean_token_accuracy": 0.9068995714187622, + "num_tokens": 815983136.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "grad_norm": 1.4181458950042725, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8825254440307617, + "num_tokens": 816027877.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "grad_norm": 1.6978814601898193, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8743982911109924, + "num_tokens": 816069398.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "grad_norm": 1.839050531387329, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8886951208114624, + "num_tokens": 816102993.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "grad_norm": 1.4456783533096313, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8930018544197083, + "num_tokens": 816144420.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "grad_norm": 1.7483479976654053, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8814038038253784, + "num_tokens": 816175879.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "grad_norm": 1.7159521579742432, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8896852731704712, + "num_tokens": 816206067.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "grad_norm": 1.74097740650177, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8785430192947388, + "num_tokens": 816240002.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "grad_norm": 1.4346668720245361, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8919936418533325, + "num_tokens": 816283989.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "grad_norm": 1.5535956621170044, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8895313143730164, + "num_tokens": 816322628.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + 
"grad_norm": 1.5202797651290894, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8820388317108154, + "num_tokens": 816363017.0, + "step": 21396 + }, + { + "epoch": 2.7219183310011448, + "grad_norm": 1.6358612775802612, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8883665800094604, + "num_tokens": 816398056.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "grad_norm": 1.7551103830337524, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8954710960388184, + "num_tokens": 816435015.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "grad_norm": 1.4758082628250122, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8928701877593994, + "num_tokens": 816477800.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "grad_norm": 1.4375678300857544, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.8981736898422241, + "num_tokens": 816516685.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "grad_norm": 1.7182050943374634, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8791605830192566, + "num_tokens": 816553451.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "grad_norm": 1.7457822561264038, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8885119557380676, + "num_tokens": 816583598.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "grad_norm": 1.5672589540481567, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8859918117523193, + "num_tokens": 816622605.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "grad_norm": 1.7635680437088013, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8799058794975281, + "num_tokens": 816658100.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "grad_norm": 1.7304651737213135, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8933370113372803, + 
"num_tokens": 816690973.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "grad_norm": 1.5188096761703491, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8923307061195374, + "num_tokens": 816730164.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "grad_norm": 1.4527208805084229, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8950483798980713, + "num_tokens": 816770212.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "grad_norm": 1.5371782779693604, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.896767795085907, + "num_tokens": 816804438.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "grad_norm": 1.4544564485549927, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8702331781387329, + "num_tokens": 816851126.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "grad_norm": 1.5524142980575562, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8806262016296387, + "num_tokens": 816891440.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "grad_norm": 1.6375328302383423, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8761675357818604, + "num_tokens": 816931762.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "grad_norm": 1.6603962182998657, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8793439865112305, + "num_tokens": 816964582.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "grad_norm": 1.6093218326568604, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8606002330780029, + "num_tokens": 817008610.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "grad_norm": 1.60444176197052, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8859061002731323, + "num_tokens": 817044619.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "grad_norm": 1.461523413658142, + 
"learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8910826444625854, + "num_tokens": 817085688.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "grad_norm": 1.6001217365264893, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8919662833213806, + "num_tokens": 817119117.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "grad_norm": 1.568230152130127, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.886600136756897, + "num_tokens": 817153477.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "grad_norm": 1.6325691938400269, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8922238349914551, + "num_tokens": 817187082.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "grad_norm": 1.6298493146896362, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8889882564544678, + "num_tokens": 817222826.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "grad_norm": 1.6236255168914795, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8872057199478149, + "num_tokens": 817259395.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "grad_norm": 1.6235029697418213, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8745362758636475, + "num_tokens": 817296456.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "grad_norm": 1.6572457551956177, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8851685523986816, + "num_tokens": 817330838.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "grad_norm": 1.5813885927200317, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8771246671676636, + "num_tokens": 817371088.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "grad_norm": 1.7168139219284058, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8737234473228455, + "num_tokens": 817407060.0, + "step": 
21424 + }, + { + "epoch": 2.725480218801679, + "grad_norm": 1.5223281383514404, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8847621083259583, + "num_tokens": 817449540.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "grad_norm": 1.519182562828064, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8812848925590515, + "num_tokens": 817488620.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "grad_norm": 1.3859251737594604, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8799083232879639, + "num_tokens": 817531985.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "grad_norm": 1.5434699058532715, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8791701197624207, + "num_tokens": 817569076.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "grad_norm": 1.7642933130264282, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8707144856452942, + "num_tokens": 817600617.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "grad_norm": 1.4803816080093384, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8831367492675781, + "num_tokens": 817639759.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "grad_norm": 1.573817253112793, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8829079866409302, + "num_tokens": 817677186.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "grad_norm": 1.678106427192688, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8698678016662598, + "num_tokens": 817713329.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "grad_norm": 1.5348628759384155, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8841102123260498, + "num_tokens": 817752037.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "grad_norm": 1.632723331451416, + "learning_rate": 1e-06, + "loss": 0.3445, + 
"mean_token_accuracy": 0.8773113489151001, + "num_tokens": 817788529.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "grad_norm": 1.5230931043624878, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8869725465774536, + "num_tokens": 817828328.0, + "step": 21435 + }, + { + "epoch": 2.726879531866175, + "grad_norm": 1.5145137310028076, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8832107782363892, + "num_tokens": 817868182.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "grad_norm": 1.548405647277832, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8810368776321411, + "num_tokens": 817907701.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "grad_norm": 1.3917142152786255, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8972377777099609, + "num_tokens": 817949582.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "grad_norm": 1.6348137855529785, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8828535676002502, + "num_tokens": 817984830.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "grad_norm": 1.4958667755126953, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8980291485786438, + "num_tokens": 818022130.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "grad_norm": 1.3904328346252441, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8995945453643799, + "num_tokens": 818061414.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "grad_norm": 1.7226804494857788, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8720080852508545, + "num_tokens": 818097049.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "grad_norm": 1.5083168745040894, + "learning_rate": 1e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.8962805271148682, + "num_tokens": 818131229.0, + "step": 21443 + }, + { + "epoch": 
2.7278972140948987, + "grad_norm": 1.5364704132080078, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8731513619422913, + "num_tokens": 818172726.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "grad_norm": 1.5779515504837036, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8972765207290649, + "num_tokens": 818211243.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "grad_norm": 1.6106780767440796, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.881482720375061, + "num_tokens": 818247467.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "grad_norm": 1.565852403640747, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8770374059677124, + "num_tokens": 818286164.0, + "step": 21447 + }, + { + "epoch": 2.728406055209261, + "grad_norm": 1.478472113609314, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8825885057449341, + "num_tokens": 818326374.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "grad_norm": 1.6073886156082153, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8896594047546387, + "num_tokens": 818362857.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "grad_norm": 1.5116972923278809, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8770313858985901, + "num_tokens": 818409379.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "grad_norm": 1.5596344470977783, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8863665461540222, + "num_tokens": 818445819.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "grad_norm": 1.6349990367889404, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.893639326095581, + "num_tokens": 818478345.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "grad_norm": 1.602186918258667, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 
0.892421543598175, + "num_tokens": 818512210.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "grad_norm": 1.497126579284668, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8905069231987, + "num_tokens": 818551066.0, + "step": 21454 + }, + { + "epoch": 2.7292965271593945, + "grad_norm": 1.4702866077423096, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8944295644760132, + "num_tokens": 818590471.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "grad_norm": 1.5741801261901855, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8770281672477722, + "num_tokens": 818629628.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "grad_norm": 1.6748430728912354, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8656964898109436, + "num_tokens": 818674926.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "grad_norm": 1.6244478225708008, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8858477473258972, + "num_tokens": 818713612.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "grad_norm": 1.4766696691513062, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8842394948005676, + "num_tokens": 818755652.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "grad_norm": 1.582349181175232, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8839097023010254, + "num_tokens": 818796195.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "grad_norm": 1.604396939277649, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8793532848358154, + "num_tokens": 818832653.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "grad_norm": 1.8704978227615356, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.880745530128479, + "num_tokens": 818866755.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "grad_norm": 
1.6988327503204346, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8809390068054199, + "num_tokens": 818905601.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "grad_norm": 1.6483980417251587, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8855048418045044, + "num_tokens": 818942058.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "grad_norm": 1.5858380794525146, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8974143266677856, + "num_tokens": 818980233.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "grad_norm": 1.512096643447876, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8792201280593872, + "num_tokens": 819022521.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "grad_norm": 1.4608745574951172, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9007705450057983, + "num_tokens": 819060289.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "grad_norm": 1.635335087776184, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8784284591674805, + "num_tokens": 819095971.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "grad_norm": 1.5318840742111206, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8935748934745789, + "num_tokens": 819133922.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "grad_norm": 1.476244330406189, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8881093263626099, + "num_tokens": 819176491.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "grad_norm": 1.5841801166534424, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8680981993675232, + "num_tokens": 819219115.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "grad_norm": 1.5233235359191895, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.882258415222168, + "num_tokens": 
819263583.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "grad_norm": 1.630971908569336, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8723661303520203, + "num_tokens": 819301210.0, + "step": 21473 + }, + { + "epoch": 2.731713522452614, + "grad_norm": 1.6876691579818726, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8737943172454834, + "num_tokens": 819337889.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "grad_norm": 1.5953943729400635, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8871294260025024, + "num_tokens": 819372559.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "grad_norm": 1.6059399843215942, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.880079448223114, + "num_tokens": 819410949.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "grad_norm": 1.476163625717163, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.8978649377822876, + "num_tokens": 819448541.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "grad_norm": 1.6250828504562378, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8762543201446533, + "num_tokens": 819488310.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "grad_norm": 1.5052886009216309, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8640539646148682, + "num_tokens": 819531182.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "grad_norm": 1.4654768705368042, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8803887367248535, + "num_tokens": 819573185.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "grad_norm": 1.5618826150894165, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8852097392082214, + "num_tokens": 819610197.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "grad_norm": 1.5646239519119263, + "learning_rate": 
1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.882821798324585, + "num_tokens": 819648754.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "grad_norm": 1.5221471786499023, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8900716304779053, + "num_tokens": 819685886.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "grad_norm": 1.4501880407333374, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8835161328315735, + "num_tokens": 819728348.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "grad_norm": 1.5629940032958984, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8841899633407593, + "num_tokens": 819768311.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "grad_norm": 1.4187098741531372, + "learning_rate": 1e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.902866780757904, + "num_tokens": 819808250.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "grad_norm": 1.5580873489379883, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8658574819564819, + "num_tokens": 819851121.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "grad_norm": 1.5097070932388306, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8911032676696777, + "num_tokens": 819889152.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "grad_norm": 1.4328266382217407, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8813881874084473, + "num_tokens": 819931907.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "grad_norm": 1.4772614240646362, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8816148638725281, + "num_tokens": 819979446.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "grad_norm": 1.5914785861968994, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.893997311592102, + "num_tokens": 820017767.0, + "step": 21491 + }, + { + 
"epoch": 2.734003307467243, + "grad_norm": 1.4948145151138306, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8935671448707581, + "num_tokens": 820057877.0, + "step": 21492 + }, + { + "epoch": 2.7341305177458337, + "grad_norm": 1.701648473739624, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8842673301696777, + "num_tokens": 820094793.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "grad_norm": 1.4751453399658203, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8907724618911743, + "num_tokens": 820135289.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "grad_norm": 1.5032615661621094, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8871574401855469, + "num_tokens": 820180075.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "grad_norm": 1.7306301593780518, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8870515823364258, + "num_tokens": 820216813.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "grad_norm": 1.653685450553894, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8916245698928833, + "num_tokens": 820250311.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "grad_norm": 1.4597235918045044, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.9037491083145142, + "num_tokens": 820289001.0, + "step": 21498 + }, + { + "epoch": 2.734893779417377, + "grad_norm": 1.5905964374542236, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8813788890838623, + "num_tokens": 820324708.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "grad_norm": 1.6247261762619019, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8870677947998047, + "num_tokens": 820361673.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "grad_norm": 1.58635675907135, + "learning_rate": 1e-06, + "loss": 0.3407, + 
"mean_token_accuracy": 0.877295732498169, + "num_tokens": 820399875.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "grad_norm": 1.5631494522094727, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8904123306274414, + "num_tokens": 820437333.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "grad_norm": 1.656996488571167, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8658645749092102, + "num_tokens": 820473912.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "grad_norm": 1.597637414932251, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8880859613418579, + "num_tokens": 820511212.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "grad_norm": 1.5989311933517456, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.884261965751648, + "num_tokens": 820545738.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "grad_norm": 1.5940279960632324, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8890694379806519, + "num_tokens": 820580554.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "grad_norm": 1.5266019105911255, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8874325752258301, + "num_tokens": 820618971.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "grad_norm": 1.638897180557251, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8888180255889893, + "num_tokens": 820658043.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "grad_norm": 1.5166614055633545, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8831096887588501, + "num_tokens": 820697317.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "grad_norm": 1.5780316591262817, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8934429883956909, + "num_tokens": 820731787.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, 
+ "grad_norm": 1.5506261587142944, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8866618871688843, + "num_tokens": 820773448.0, + "step": 21511 + }, + { + "epoch": 2.7365475130390537, + "grad_norm": 1.3189908266067505, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8884596824645996, + "num_tokens": 820821146.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "grad_norm": 1.4722570180892944, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8962578773498535, + "num_tokens": 820861787.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "grad_norm": 1.4412024021148682, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8836735486984253, + "num_tokens": 820906990.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "grad_norm": 1.5708985328674316, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8901630640029907, + "num_tokens": 820944059.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "grad_norm": 1.5563746690750122, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8785574436187744, + "num_tokens": 820983264.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "grad_norm": 1.5425398349761963, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8926169276237488, + "num_tokens": 821019834.0, + "step": 21517 + }, + { + "epoch": 2.7373107747105965, + "grad_norm": 1.6302310228347778, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8834282159805298, + "num_tokens": 821053568.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "grad_norm": 1.5973246097564697, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8836965560913086, + "num_tokens": 821089819.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "grad_norm": 1.6329225301742554, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8813366293907166, + 
"num_tokens": 821127330.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "grad_norm": 1.6276276111602783, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8874034881591797, + "num_tokens": 821163451.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "grad_norm": 1.5944063663482666, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8829488754272461, + "num_tokens": 821198719.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "grad_norm": 1.4478362798690796, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8807686567306519, + "num_tokens": 821243594.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "grad_norm": 1.5088542699813843, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8958789110183716, + "num_tokens": 821279602.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "grad_norm": 1.4566885232925415, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8950657844543457, + "num_tokens": 821322046.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "grad_norm": 1.5633196830749512, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.8956893682479858, + "num_tokens": 821357598.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "grad_norm": 1.6699044704437256, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8803672790527344, + "num_tokens": 821391315.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "grad_norm": 1.5805565118789673, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8774394989013672, + "num_tokens": 821433868.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "grad_norm": 1.3825088739395142, + "learning_rate": 1e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.9016512632369995, + "num_tokens": 821477520.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "grad_norm": 1.778587818145752, + 
"learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8734223246574402, + "num_tokens": 821509705.0, + "step": 21530 + }, + { + "epoch": 2.7389645083322733, + "grad_norm": 1.5991507768630981, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8778208494186401, + "num_tokens": 821549228.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "grad_norm": 1.5621540546417236, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8803278803825378, + "num_tokens": 821586872.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "grad_norm": 1.6315288543701172, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8847247958183289, + "num_tokens": 821621699.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "grad_norm": 1.435971975326538, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8902403712272644, + "num_tokens": 821663182.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "grad_norm": 1.5394539833068848, + "learning_rate": 1e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.8964846134185791, + "num_tokens": 821700127.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "grad_norm": 1.494497299194336, + "learning_rate": 1e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9051963090896606, + "num_tokens": 821734664.0, + "step": 21536 + }, + { + "epoch": 2.7397277700038165, + "grad_norm": 1.616446852684021, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8865854740142822, + "num_tokens": 821770868.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "grad_norm": 1.4820494651794434, + "learning_rate": 1e-06, + "loss": 0.2751, + "mean_token_accuracy": 0.8990470170974731, + "num_tokens": 821812041.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "grad_norm": 1.4400430917739868, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8921024799346924, + "num_tokens": 821851268.0, + 
"step": 21539 + }, + { + "epoch": 2.7401094008395877, + "grad_norm": 1.5895744562149048, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8870881199836731, + "num_tokens": 821886863.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "grad_norm": 1.5612064599990845, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8847813606262207, + "num_tokens": 821925215.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "grad_norm": 1.6215574741363525, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8731791973114014, + "num_tokens": 821964519.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "grad_norm": 1.4925943613052368, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8861717581748962, + "num_tokens": 822001057.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "grad_norm": 1.6137229204177856, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8751183748245239, + "num_tokens": 822039307.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "grad_norm": 1.6818578243255615, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8933919668197632, + "num_tokens": 822073489.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "grad_norm": 1.6321321725845337, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.881304919719696, + "num_tokens": 822110224.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "grad_norm": 1.5481994152069092, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8806687593460083, + "num_tokens": 822148875.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "grad_norm": 1.6068023443222046, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8835972547531128, + "num_tokens": 822186758.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "grad_norm": 1.4499090909957886, + "learning_rate": 1e-06, + 
"loss": 0.2681, + "mean_token_accuracy": 0.9006160497665405, + "num_tokens": 822223329.0, + "step": 21549 + }, + { + "epoch": 2.741381503625493, + "grad_norm": 1.562111496925354, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8875706195831299, + "num_tokens": 822259460.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "grad_norm": 1.6480116844177246, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.887105405330658, + "num_tokens": 822291429.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "grad_norm": 1.6141201257705688, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8956880569458008, + "num_tokens": 822325506.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "grad_norm": 1.7742947340011597, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8871610164642334, + "num_tokens": 822356868.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "grad_norm": 1.6127949953079224, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8774897456169128, + "num_tokens": 822391184.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "grad_norm": 1.5740658044815063, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8918350338935852, + "num_tokens": 822427040.0, + "step": 21555 + }, + { + "epoch": 2.742144765297036, + "grad_norm": 1.643395185470581, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8867030143737793, + "num_tokens": 822460750.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "grad_norm": 1.6912319660186768, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.9011736512184143, + "num_tokens": 822490585.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "grad_norm": 1.5478321313858032, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8777498006820679, + "num_tokens": 822529048.0, + "step": 21558 + }, + { + "epoch": 
2.7425263961328077, + "grad_norm": 1.6801685094833374, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8786558508872986, + "num_tokens": 822563915.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "grad_norm": 1.6022415161132812, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8853064179420471, + "num_tokens": 822600793.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "grad_norm": 1.5797444581985474, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8894495964050293, + "num_tokens": 822633586.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "grad_norm": 1.4486289024353027, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8890469670295715, + "num_tokens": 822671634.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "grad_norm": 1.578770399093628, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8623977899551392, + "num_tokens": 822711438.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "grad_norm": 1.7358818054199219, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8842672109603882, + "num_tokens": 822743319.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "grad_norm": 1.6470195055007935, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8831250667572021, + "num_tokens": 822778130.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "grad_norm": 1.6547926664352417, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8797602653503418, + "num_tokens": 822813281.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "grad_norm": 1.6338633298873901, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8936381340026855, + "num_tokens": 822846421.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "grad_norm": 1.5896646976470947, + "learning_rate": 1e-06, + "loss": 0.2993, + 
"mean_token_accuracy": 0.8932840824127197, + "num_tokens": 822881905.0, + "step": 21568 + }, + { + "epoch": 2.7437984989187125, + "grad_norm": 1.6163699626922607, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8907583951950073, + "num_tokens": 822914659.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "grad_norm": 1.6475902795791626, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8864997029304504, + "num_tokens": 822949476.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "grad_norm": 1.5952754020690918, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8880690336227417, + "num_tokens": 822986418.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "grad_norm": 1.5315860509872437, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8906629681587219, + "num_tokens": 823025532.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "grad_norm": 1.6562858819961548, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8862687349319458, + "num_tokens": 823062874.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "grad_norm": 1.5048097372055054, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.9008004069328308, + "num_tokens": 823100977.0, + "step": 21574 + }, + { + "epoch": 2.7445617605902557, + "grad_norm": 1.6572115421295166, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8741099834442139, + "num_tokens": 823135951.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "grad_norm": 1.645896553993225, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8809292316436768, + "num_tokens": 823168524.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "grad_norm": 1.5802360773086548, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.881790041923523, + "num_tokens": 823207594.0, + "step": 21577 + }, + { + "epoch": 
2.7449433914260273, + "grad_norm": 1.6567453145980835, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8835855722427368, + "num_tokens": 823243520.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "grad_norm": 1.5111464262008667, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8820551037788391, + "num_tokens": 823282422.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "grad_norm": 1.585295557975769, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8844959735870361, + "num_tokens": 823319654.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "grad_norm": 1.609411597251892, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8878750205039978, + "num_tokens": 823353790.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "grad_norm": 1.590509057044983, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8833760619163513, + "num_tokens": 823389057.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "grad_norm": 1.5484195947647095, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.884414792060852, + "num_tokens": 823430285.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "grad_norm": 1.6600736379623413, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8899632692337036, + "num_tokens": 823463216.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "grad_norm": 1.644738793373108, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8753438591957092, + "num_tokens": 823503402.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "grad_norm": 1.5220921039581299, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8924165368080139, + "num_tokens": 823544129.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "grad_norm": 1.638568639755249, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 
0.887047290802002, + "num_tokens": 823581836.0, + "step": 21587 + }, + { + "epoch": 2.746215494211932, + "grad_norm": 1.6220885515213013, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.887656569480896, + "num_tokens": 823617343.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "grad_norm": 1.5173757076263428, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8879035115242004, + "num_tokens": 823658287.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "grad_norm": 1.4985435009002686, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8860131502151489, + "num_tokens": 823696282.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "grad_norm": 1.594552755355835, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8810195326805115, + "num_tokens": 823733904.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "grad_norm": 1.6257919073104858, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8838156461715698, + "num_tokens": 823775484.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "grad_norm": 1.6054565906524658, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.89552241563797, + "num_tokens": 823811524.0, + "step": 21593 + }, + { + "epoch": 2.7469787558834753, + "grad_norm": 1.6077814102172852, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8816936016082764, + "num_tokens": 823850054.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "grad_norm": 1.5264568328857422, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8910112380981445, + "num_tokens": 823889214.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "grad_norm": 1.5403567552566528, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8789070248603821, + "num_tokens": 823927947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "grad_norm": 
1.5918779373168945, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8870139122009277, + "num_tokens": 823963771.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "grad_norm": 1.3225589990615845, + "learning_rate": 1e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9032102227210999, + "num_tokens": 824008418.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "grad_norm": 1.5759824514389038, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8886661529541016, + "num_tokens": 824041736.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "grad_norm": 1.4256021976470947, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8863668441772461, + "num_tokens": 824086948.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "grad_norm": 1.5356446504592896, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8947370052337646, + "num_tokens": 824123170.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "grad_norm": 1.5899943113327026, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.892109215259552, + "num_tokens": 824158989.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "grad_norm": 1.5513063669204712, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8919827938079834, + "num_tokens": 824197723.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "grad_norm": 1.6075438261032104, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.892462968826294, + "num_tokens": 824232460.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "grad_norm": 1.457690954208374, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8804033398628235, + "num_tokens": 824278638.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "grad_norm": 1.578637957572937, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8942928314208984, + "num_tokens": 
824312288.0, + "step": 21606 + }, + { + "epoch": 2.748632489505152, + "grad_norm": 1.5617517232894897, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8832864761352539, + "num_tokens": 824350112.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "grad_norm": 1.6599310636520386, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8869572877883911, + "num_tokens": 824384587.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "grad_norm": 1.5172070264816284, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8822405338287354, + "num_tokens": 824428726.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "grad_norm": 1.4669162034988403, + "learning_rate": 1e-06, + "loss": 0.2644, + "mean_token_accuracy": 0.9013568758964539, + "num_tokens": 824467960.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "grad_norm": 1.5047556161880493, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8947634696960449, + "num_tokens": 824504416.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "grad_norm": 1.7225369215011597, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8782330751419067, + "num_tokens": 824538068.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, + "grad_norm": 1.5034959316253662, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8911190032958984, + "num_tokens": 824575809.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "grad_norm": 1.4866975545883179, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8924674987792969, + "num_tokens": 824615423.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "grad_norm": 1.5982747077941895, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8881930708885193, + "num_tokens": 824651278.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "grad_norm": 1.550096869468689, + "learning_rate": 
1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8881344795227051, + "num_tokens": 824689674.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "grad_norm": 1.5550041198730469, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8799211978912354, + "num_tokens": 824727711.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "grad_norm": 1.4186508655548096, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8966490030288696, + "num_tokens": 824766959.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "grad_norm": 1.392044186592102, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8905935287475586, + "num_tokens": 824812140.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "grad_norm": 1.4729729890823364, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8935527801513672, + "num_tokens": 824850879.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "grad_norm": 1.5856302976608276, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8920689225196838, + "num_tokens": 824887224.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "grad_norm": 1.6145700216293335, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8872272968292236, + "num_tokens": 824925284.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "grad_norm": 1.4696699380874634, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8796219229698181, + "num_tokens": 824969193.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "grad_norm": 1.6905163526535034, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8656740188598633, + "num_tokens": 825008053.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "grad_norm": 1.6152414083480835, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8788337707519531, + "num_tokens": 825046534.0, + "step": 21625 + }, + { + 
"epoch": 2.7510494847983717, + "grad_norm": 1.6528820991516113, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8867285847663879, + "num_tokens": 825081025.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "grad_norm": 1.6279257535934448, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8742384314537048, + "num_tokens": 825117084.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "grad_norm": 1.5803899765014648, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8901562094688416, + "num_tokens": 825157235.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "grad_norm": 1.5140057802200317, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8785711526870728, + "num_tokens": 825198555.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "grad_norm": 1.6635996103286743, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8852212429046631, + "num_tokens": 825238411.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "grad_norm": 1.5388273000717163, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8814501762390137, + "num_tokens": 825280674.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "grad_norm": 1.616013526916504, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8763099908828735, + "num_tokens": 825319062.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "grad_norm": 1.4186547994613647, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8897709250450134, + "num_tokens": 825360358.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "grad_norm": 1.5871214866638184, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8778519630432129, + "num_tokens": 825403727.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "grad_norm": 1.5902209281921387, + "learning_rate": 1e-06, + "loss": 0.3445, + 
"mean_token_accuracy": 0.8782463073730469, + "num_tokens": 825441798.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "grad_norm": 1.5132040977478027, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8938729763031006, + "num_tokens": 825481455.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "grad_norm": 1.5775272846221924, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8907660245895386, + "num_tokens": 825519125.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "grad_norm": 1.5970438718795776, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.892555832862854, + "num_tokens": 825555472.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "grad_norm": 1.5439165830612183, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8957713842391968, + "num_tokens": 825592107.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "grad_norm": 1.4975053071975708, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8766300678253174, + "num_tokens": 825635689.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "grad_norm": 1.5038928985595703, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8958237767219543, + "num_tokens": 825673261.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "grad_norm": 1.4915138483047485, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8981702923774719, + "num_tokens": 825715368.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "grad_norm": 1.4775688648223877, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8962714672088623, + "num_tokens": 825756511.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "grad_norm": 1.5971812009811401, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8772550821304321, + "num_tokens": 825793768.0, + "step": 21644 + }, + { + "epoch": 
2.7534664800915913, + "grad_norm": 1.6447008848190308, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8709433674812317, + "num_tokens": 825833587.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "grad_norm": 1.5083377361297607, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8780016899108887, + "num_tokens": 825873916.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "grad_norm": 1.4413633346557617, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8913978338241577, + "num_tokens": 825916373.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "grad_norm": 1.5106087923049927, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8755664825439453, + "num_tokens": 825955040.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "grad_norm": 1.6396843194961548, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8834269046783447, + "num_tokens": 825988254.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "grad_norm": 1.629682183265686, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8883672952651978, + "num_tokens": 826025721.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "grad_norm": 1.4899569749832153, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8927914500236511, + "num_tokens": 826065868.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "grad_norm": 1.5447208881378174, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8891830444335938, + "num_tokens": 826104977.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "grad_norm": 1.7778671979904175, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8869348168373108, + "num_tokens": 826133824.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "grad_norm": 1.5546410083770752, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 
0.8847754597663879, + "num_tokens": 826172454.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "grad_norm": 1.5286177396774292, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8808683753013611, + "num_tokens": 826217530.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "grad_norm": 1.565665602684021, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8817355632781982, + "num_tokens": 826257290.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "grad_norm": 1.6073945760726929, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8716275691986084, + "num_tokens": 826297166.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "grad_norm": 1.4691418409347534, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8920220136642456, + "num_tokens": 826341053.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "grad_norm": 1.6479296684265137, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8769299983978271, + "num_tokens": 826380368.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "grad_norm": 1.391202688217163, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8952062129974365, + "num_tokens": 826421405.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "grad_norm": 1.5941487550735474, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.89605712890625, + "num_tokens": 826458352.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "grad_norm": 1.560564637184143, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8894508481025696, + "num_tokens": 826495697.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "grad_norm": 1.4756454229354858, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8903447985649109, + "num_tokens": 826536474.0, + "step": 21663 + }, + { + "epoch": 2.755883475384811, + "grad_norm": 
1.434497356414795, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8930231928825378, + "num_tokens": 826577042.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "grad_norm": 1.5210225582122803, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8782334327697754, + "num_tokens": 826618238.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "grad_norm": 1.5805424451828003, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8882410526275635, + "num_tokens": 826655374.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "grad_norm": 1.484499454498291, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8874439597129822, + "num_tokens": 826694894.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "grad_norm": 1.5660715103149414, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8834999799728394, + "num_tokens": 826730977.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "grad_norm": 1.5527747869491577, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.888849139213562, + "num_tokens": 826769105.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "grad_norm": 1.5810586214065552, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8877671957015991, + "num_tokens": 826810170.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "grad_norm": 1.4797874689102173, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8871191740036011, + "num_tokens": 826847239.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "grad_norm": 1.7309240102767944, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8842445611953735, + "num_tokens": 826881591.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "grad_norm": 1.6076492071151733, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8796573877334595, + "num_tokens": 
826916129.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "grad_norm": 1.5228798389434814, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8933402895927429, + "num_tokens": 826952386.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "grad_norm": 1.4870102405548096, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8892769813537598, + "num_tokens": 826992343.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "grad_norm": 1.4649626016616821, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8920682668685913, + "num_tokens": 827033992.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "grad_norm": 1.5530146360397339, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8896274566650391, + "num_tokens": 827071228.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "grad_norm": 1.416550636291504, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.8983860015869141, + "num_tokens": 827110258.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "grad_norm": 1.633816123008728, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8925184011459351, + "num_tokens": 827143949.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "grad_norm": 1.5528056621551514, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8933451771736145, + "num_tokens": 827178445.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "grad_norm": 1.633001685142517, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8949087858200073, + "num_tokens": 827216933.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "grad_norm": 1.7443621158599854, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8759362101554871, + "num_tokens": 827251581.0, + "step": 21682 + }, + { + "epoch": 2.758300470678031, + "grad_norm": 1.6631020307540894, + "learning_rate": 1e-06, 
+ "loss": 0.311, + "mean_token_accuracy": 0.8849169015884399, + "num_tokens": 827284627.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "grad_norm": 1.6488338708877563, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8618295192718506, + "num_tokens": 827323080.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "grad_norm": 1.5366945266723633, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8766266107559204, + "num_tokens": 827363311.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "grad_norm": 1.502652645111084, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.8958180546760559, + "num_tokens": 827400546.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "grad_norm": 1.5324515104293823, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8886433839797974, + "num_tokens": 827436285.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "grad_norm": 1.5984604358673096, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8964104652404785, + "num_tokens": 827471462.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "grad_norm": 1.5421440601348877, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8768171072006226, + "num_tokens": 827515111.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "grad_norm": 1.533273696899414, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8863343000411987, + "num_tokens": 827553624.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "grad_norm": 1.6055353879928589, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8925430178642273, + "num_tokens": 827589697.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "grad_norm": 1.5356820821762085, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.888335108757019, + "num_tokens": 827631805.0, + "step": 21692 + }, + { + "epoch": 
2.759572573463936, + "grad_norm": 1.6134884357452393, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8778318166732788, + "num_tokens": 827669939.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "grad_norm": 1.6738988161087036, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8681057095527649, + "num_tokens": 827710114.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "grad_norm": 1.5914382934570312, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8917929530143738, + "num_tokens": 827748286.0, + "step": 21695 + }, + { + "epoch": 2.7599542042997074, + "grad_norm": 1.6562165021896362, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8893206119537354, + "num_tokens": 827782587.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "grad_norm": 1.7156367301940918, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.875802755355835, + "num_tokens": 827825379.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "grad_norm": 1.557428240776062, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8994508981704712, + "num_tokens": 827861614.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "grad_norm": 1.5871081352233887, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8792316913604736, + "num_tokens": 827902218.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "grad_norm": 1.6263281106948853, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8826563358306885, + "num_tokens": 827937837.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "grad_norm": 1.6206920146942139, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8866761922836304, + "num_tokens": 827972063.0, + "step": 21701 + }, + { + "epoch": 2.7607174659712506, + "grad_norm": 1.6535797119140625, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 
0.8947622776031494, + "num_tokens": 828013286.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "grad_norm": 1.640979528427124, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8894907236099243, + "num_tokens": 828047606.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "grad_norm": 1.6277167797088623, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8816895484924316, + "num_tokens": 828084814.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "grad_norm": 1.447759747505188, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8913546800613403, + "num_tokens": 828124693.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "grad_norm": 1.5377638339996338, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8729120492935181, + "num_tokens": 828165418.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "grad_norm": 1.580592393875122, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.893136739730835, + "num_tokens": 828204633.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "grad_norm": 1.4265388250350952, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8944356441497803, + "num_tokens": 828244880.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "grad_norm": 1.5590178966522217, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8808504939079285, + "num_tokens": 828283551.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "grad_norm": 1.6184765100479126, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.883359968662262, + "num_tokens": 828320466.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "grad_norm": 1.6032086610794067, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8778464198112488, + "num_tokens": 828359021.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "grad_norm": 
1.4807578325271606, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8868111968040466, + "num_tokens": 828399289.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "grad_norm": 1.6773630380630493, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8764656186103821, + "num_tokens": 828435460.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "grad_norm": 1.6731430292129517, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8736934065818787, + "num_tokens": 828472730.0, + "step": 21714 + }, + { + "epoch": 2.762371199592927, + "grad_norm": 1.4602985382080078, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8928992748260498, + "num_tokens": 828513688.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "grad_norm": 1.5462831258773804, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8774127960205078, + "num_tokens": 828553482.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "grad_norm": 1.4215774536132812, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8881669640541077, + "num_tokens": 828594122.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "grad_norm": 1.501365065574646, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8884017467498779, + "num_tokens": 828632846.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "grad_norm": 1.5416697263717651, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8722460269927979, + "num_tokens": 828671880.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "grad_norm": 1.4854326248168945, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8928548693656921, + "num_tokens": 828711759.0, + "step": 21720 + }, + { + "epoch": 2.76313446126447, + "grad_norm": 1.5336014032363892, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8790528774261475, + "num_tokens": 
828752126.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "grad_norm": 1.605827808380127, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8841488361358643, + "num_tokens": 828790633.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "grad_norm": 1.6407212018966675, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8653643131256104, + "num_tokens": 828831553.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "grad_norm": 1.675894856452942, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8981973528862, + "num_tokens": 828865522.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "grad_norm": 1.5799081325531006, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8935983180999756, + "num_tokens": 828900142.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "grad_norm": 1.4438347816467285, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.89574134349823, + "num_tokens": 828943183.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "grad_norm": 1.6096985340118408, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8907372951507568, + "num_tokens": 828981113.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "grad_norm": 1.5671285390853882, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8821313381195068, + "num_tokens": 829019773.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "grad_norm": 1.5575734376907349, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8902705907821655, + "num_tokens": 829056449.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "grad_norm": 1.624265193939209, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8708168864250183, + "num_tokens": 829094531.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "grad_norm": 1.572184681892395, + "learning_rate": 1e-06, + 
"loss": 0.3327, + "mean_token_accuracy": 0.880602240562439, + "num_tokens": 829134442.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "grad_norm": 1.6278257369995117, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8847764134407043, + "num_tokens": 829174114.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "grad_norm": 1.4335083961486816, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8853347897529602, + "num_tokens": 829217718.0, + "step": 21733 + }, + { + "epoch": 2.7647881948861466, + "grad_norm": 1.5763624906539917, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.887688398361206, + "num_tokens": 829254917.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "grad_norm": 1.6471155881881714, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8807974457740784, + "num_tokens": 829293343.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "grad_norm": 1.428947925567627, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.8980706930160522, + "num_tokens": 829336581.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "grad_norm": 1.584509015083313, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8793272972106934, + "num_tokens": 829374062.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "grad_norm": 1.5467053651809692, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8746386170387268, + "num_tokens": 829414539.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "grad_norm": 1.4968862533569336, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8844048976898193, + "num_tokens": 829454121.0, + "step": 21739 + }, + { + "epoch": 2.7655514565576897, + "grad_norm": 1.4893333911895752, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8812466263771057, + "num_tokens": 829495213.0, + "step": 21740 + }, + { + 
"epoch": 2.7656786668362803, + "grad_norm": 1.6965101957321167, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8822488784790039, + "num_tokens": 829528690.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "grad_norm": 1.4351122379302979, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8778266906738281, + "num_tokens": 829575709.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "grad_norm": 1.526528239250183, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8823356628417969, + "num_tokens": 829617030.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "grad_norm": 1.5331732034683228, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8898081183433533, + "num_tokens": 829654918.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "grad_norm": 1.5153968334197998, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8878026008605957, + "num_tokens": 829696645.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "grad_norm": 1.5882256031036377, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8907001614570618, + "num_tokens": 829735658.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "grad_norm": 1.6009589433670044, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8879010677337646, + "num_tokens": 829772385.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "grad_norm": 1.5091770887374878, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8866770267486572, + "num_tokens": 829811355.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "grad_norm": 1.7094473838806152, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8970462083816528, + "num_tokens": 829842074.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "grad_norm": 1.5615438222885132, + "learning_rate": 1e-06, + "loss": 0.32, + 
"mean_token_accuracy": 0.8856905698776245, + "num_tokens": 829877715.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "grad_norm": 1.538382649421692, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8684543371200562, + "num_tokens": 829920950.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "grad_norm": 1.5398964881896973, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8869648575782776, + "num_tokens": 829960710.0, + "step": 21752 + }, + { + "epoch": 2.7672051901793666, + "grad_norm": 1.7014471292495728, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.897452712059021, + "num_tokens": 829995538.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "grad_norm": 1.51266348361969, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8834505081176758, + "num_tokens": 830037232.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "grad_norm": 1.7025610208511353, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8835474252700806, + "num_tokens": 830073683.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "grad_norm": 1.5830841064453125, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8984690308570862, + "num_tokens": 830107937.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "grad_norm": 1.640466332435608, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8810413479804993, + "num_tokens": 830144044.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "grad_norm": 1.4674839973449707, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8956587910652161, + "num_tokens": 830185408.0, + "step": 21758 + }, + { + "epoch": 2.7679684518509093, + "grad_norm": 1.5754585266113281, + "learning_rate": 1e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9002116918563843, + "num_tokens": 830221728.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, 
+ "grad_norm": 1.5331600904464722, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.889218807220459, + "num_tokens": 830263993.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "grad_norm": 1.5010735988616943, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8794453740119934, + "num_tokens": 830305862.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "grad_norm": 1.5982757806777954, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8864859342575073, + "num_tokens": 830344342.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "grad_norm": 1.5832000970840454, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8834452033042908, + "num_tokens": 830382988.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "grad_norm": 1.4882235527038574, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.900100827217102, + "num_tokens": 830420762.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "grad_norm": 1.405876636505127, + "learning_rate": 1e-06, + "loss": 0.2623, + "mean_token_accuracy": 0.9044294953346252, + "num_tokens": 830460053.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "grad_norm": 1.4312705993652344, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8985105752944946, + "num_tokens": 830504238.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "grad_norm": 1.4808075428009033, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8953890800476074, + "num_tokens": 830542998.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "grad_norm": 1.5284112691879272, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8823126554489136, + "num_tokens": 830582808.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "grad_norm": 1.662253499031067, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8858799934387207, + 
"num_tokens": 830620958.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "grad_norm": 1.639051914215088, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.888494610786438, + "num_tokens": 830658660.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "grad_norm": 1.478358268737793, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8864560127258301, + "num_tokens": 830699519.0, + "step": 21771 + }, + { + "epoch": 2.769622185472586, + "grad_norm": 1.6554455757141113, + "learning_rate": 1e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.8951618671417236, + "num_tokens": 830731650.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "grad_norm": 1.497024416923523, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8834008574485779, + "num_tokens": 830771761.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "grad_norm": 1.5296714305877686, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8762496113777161, + "num_tokens": 830816067.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "grad_norm": 1.4907246828079224, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8770782947540283, + "num_tokens": 830860562.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "grad_norm": 1.5563453435897827, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8876839876174927, + "num_tokens": 830896326.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "grad_norm": 1.5867563486099243, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8774191737174988, + "num_tokens": 830933757.0, + "step": 21777 + }, + { + "epoch": 2.7703854471441294, + "grad_norm": 1.5600199699401855, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8885392546653748, + "num_tokens": 830968957.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "grad_norm": 1.7672791481018066, + 
"learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8651747703552246, + "num_tokens": 831006366.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "grad_norm": 1.6964242458343506, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8730298280715942, + "num_tokens": 831043478.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "grad_norm": 1.6598905324935913, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8746117353439331, + "num_tokens": 831082077.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "grad_norm": 1.5272514820098877, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8788942694664001, + "num_tokens": 831123573.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "grad_norm": 1.466874361038208, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8919676542282104, + "num_tokens": 831165987.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "grad_norm": 1.6130075454711914, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8873993158340454, + "num_tokens": 831199404.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "grad_norm": 1.7719224691390991, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8716982007026672, + "num_tokens": 831232131.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "grad_norm": 1.5858725309371948, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8835928440093994, + "num_tokens": 831267842.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "grad_norm": 1.639267086982727, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8815966248512268, + "num_tokens": 831302333.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "grad_norm": 1.5016608238220215, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8807318210601807, + "num_tokens": 831342019.0, + "step": 21788 + 
}, + { + "epoch": 2.7717847602086247, + "grad_norm": 1.6231944561004639, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8903462886810303, + "num_tokens": 831376986.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "grad_norm": 1.5438735485076904, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8929656147956848, + "num_tokens": 831418003.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "grad_norm": 1.4411108493804932, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8858187198638916, + "num_tokens": 831461075.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "grad_norm": 1.6188446283340454, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8801649212837219, + "num_tokens": 831498512.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "grad_norm": 1.5529019832611084, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8803242444992065, + "num_tokens": 831539135.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "grad_norm": 1.5308657884597778, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.882667064666748, + "num_tokens": 831578736.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "grad_norm": 1.5701297521591187, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8948285579681396, + "num_tokens": 831619599.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "grad_norm": 1.738114356994629, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.875247597694397, + "num_tokens": 831656487.0, + "step": 21796 + }, + { + "epoch": 2.772802442437349, + "grad_norm": 1.5450761318206787, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.881839394569397, + "num_tokens": 831696274.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "grad_norm": 1.5910065174102783, + "learning_rate": 1e-06, + "loss": 0.3263, + 
"mean_token_accuracy": 0.8850440979003906, + "num_tokens": 831734230.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "grad_norm": 1.4973762035369873, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8968415260314941, + "num_tokens": 831772754.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "grad_norm": 1.5762139558792114, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8836456537246704, + "num_tokens": 831809692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "grad_norm": 1.6134263277053833, + "learning_rate": 1e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9008678197860718, + "num_tokens": 831842078.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "grad_norm": 1.6002888679504395, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8885003328323364, + "num_tokens": 831875273.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "grad_norm": 1.5415420532226562, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8867471814155579, + "num_tokens": 831913118.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "grad_norm": 1.502663493156433, + "learning_rate": 1e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.8978021144866943, + "num_tokens": 831950661.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "grad_norm": 1.4297977685928345, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8802745342254639, + "num_tokens": 831989506.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "grad_norm": 1.6606053113937378, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8810356855392456, + "num_tokens": 832027971.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "grad_norm": 1.8037266731262207, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8750958442687988, + "num_tokens": 832060955.0, + "step": 21807 + }, + { + "epoch": 
2.774201755501845, + "grad_norm": 1.719234585762024, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8925893306732178, + "num_tokens": 832097110.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "grad_norm": 1.6587806940078735, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8896169662475586, + "num_tokens": 832129478.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "grad_norm": 1.7457784414291382, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.888431191444397, + "num_tokens": 832162549.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "grad_norm": 1.6212760210037231, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8891913294792175, + "num_tokens": 832201145.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "grad_norm": 1.5535662174224854, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.8989492058753967, + "num_tokens": 832236862.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "grad_norm": 1.5455877780914307, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8973304629325867, + "num_tokens": 832275292.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "grad_norm": 1.6058268547058105, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8814821243286133, + "num_tokens": 832311864.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "grad_norm": 1.663735032081604, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8885196447372437, + "num_tokens": 832352047.0, + "step": 21815 + }, + { + "epoch": 2.7752194377305686, + "grad_norm": 1.6736029386520386, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8728171586990356, + "num_tokens": 832386608.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "grad_norm": 1.5460175275802612, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 
0.892619788646698, + "num_tokens": 832426924.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "grad_norm": 1.5555545091629028, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8870076537132263, + "num_tokens": 832466619.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "grad_norm": 1.5501775741577148, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8942840099334717, + "num_tokens": 832505944.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "grad_norm": 1.6269692182540894, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8745790719985962, + "num_tokens": 832544462.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "grad_norm": 1.5954411029815674, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.870980441570282, + "num_tokens": 832586175.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "grad_norm": 1.5880464315414429, + "learning_rate": 1e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.8987182974815369, + "num_tokens": 832620818.0, + "step": 21822 + }, + { + "epoch": 2.7761099096807023, + "grad_norm": 1.6288559436798096, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8913557529449463, + "num_tokens": 832652365.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "grad_norm": 1.5882893800735474, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8856618404388428, + "num_tokens": 832691332.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "grad_norm": 1.4541867971420288, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8820722103118896, + "num_tokens": 832736531.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "grad_norm": 1.6170090436935425, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8937245607376099, + "num_tokens": 832769730.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "grad_norm": 
1.6973469257354736, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8887699842453003, + "num_tokens": 832806547.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "grad_norm": 1.8692010641098022, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8858950138092041, + "num_tokens": 832840661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "grad_norm": 1.6000930070877075, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8878211975097656, + "num_tokens": 832877400.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "grad_norm": 1.5630322694778442, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8917707204818726, + "num_tokens": 832913454.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "grad_norm": 1.394282341003418, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8862556219100952, + "num_tokens": 832958261.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "grad_norm": 1.5534343719482422, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8804347515106201, + "num_tokens": 832996045.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "grad_norm": 1.6310235261917114, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.882677435874939, + "num_tokens": 833037022.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "grad_norm": 1.5349026918411255, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.884968638420105, + "num_tokens": 833078987.0, + "step": 21834 + }, + { + "epoch": 2.7776364330237886, + "grad_norm": 1.7865676879882812, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8811630606651306, + "num_tokens": 833110017.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "grad_norm": 1.4946039915084839, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8890448808670044, + 
"num_tokens": 833154045.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "grad_norm": 1.5103355646133423, + "learning_rate": 1e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.9005613327026367, + "num_tokens": 833192530.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "grad_norm": 1.686389684677124, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8807384967803955, + "num_tokens": 833227498.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "grad_norm": 1.5513368844985962, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8967689275741577, + "num_tokens": 833265792.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "grad_norm": 1.4205009937286377, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8826887607574463, + "num_tokens": 833310473.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "grad_norm": 1.535029411315918, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8841980695724487, + "num_tokens": 833351286.0, + "step": 21841 + }, + { + "epoch": 2.778526904973922, + "grad_norm": 1.6010836362838745, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8790467381477356, + "num_tokens": 833391388.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "grad_norm": 1.6120691299438477, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8884909749031067, + "num_tokens": 833428338.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "grad_norm": 1.5987329483032227, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8833688497543335, + "num_tokens": 833462704.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "grad_norm": 1.6148889064788818, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.877530574798584, + "num_tokens": 833500651.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "grad_norm": 1.5792969465255737, + 
"learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8945306539535522, + "num_tokens": 833534970.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "grad_norm": 1.4749130010604858, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8857054114341736, + "num_tokens": 833574138.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "grad_norm": 1.5633304119110107, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8784375190734863, + "num_tokens": 833615535.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "grad_norm": 1.6430059671401978, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8891048431396484, + "num_tokens": 833651198.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "grad_norm": 1.758641242980957, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8786241412162781, + "num_tokens": 833683303.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "grad_norm": 1.396284580230713, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8861647248268127, + "num_tokens": 833729397.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "grad_norm": 1.5209087133407593, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8893492817878723, + "num_tokens": 833766728.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "grad_norm": 1.6848424673080444, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8798198699951172, + "num_tokens": 833802118.0, + "step": 21853 + }, + { + "epoch": 2.780053428317008, + "grad_norm": 1.6948612928390503, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8709219098091125, + "num_tokens": 833835948.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "grad_norm": 1.502875804901123, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8934993147850037, + "num_tokens": 833870229.0, + "step": 
21855 + }, + { + "epoch": 2.7803078488741892, + "grad_norm": 1.59456205368042, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8960022926330566, + "num_tokens": 833906562.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "grad_norm": 1.5794943571090698, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8869419693946838, + "num_tokens": 833942894.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "grad_norm": 1.4638357162475586, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8823994994163513, + "num_tokens": 833983927.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "grad_norm": 1.6494202613830566, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8873656988143921, + "num_tokens": 834016813.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "grad_norm": 1.480136752128601, + "learning_rate": 1e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9005045890808105, + "num_tokens": 834054137.0, + "step": 21860 + }, + { + "epoch": 2.7809439002671414, + "grad_norm": 1.6181857585906982, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8807528018951416, + "num_tokens": 834090151.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "grad_norm": 1.448399305343628, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8876558542251587, + "num_tokens": 834132591.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "grad_norm": 1.5539019107818604, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8747739791870117, + "num_tokens": 834173841.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "grad_norm": 1.7481379508972168, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8750717043876648, + "num_tokens": 834205904.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "grad_norm": 1.6003576517105103, + "learning_rate": 1e-06, + "loss": 0.2917, 
+ "mean_token_accuracy": 0.8930399417877197, + "num_tokens": 834238283.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "grad_norm": 1.6960853338241577, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8892530202865601, + "num_tokens": 834271747.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "grad_norm": 1.6775842905044556, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8791091442108154, + "num_tokens": 834308353.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "grad_norm": 1.5566023588180542, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8904884457588196, + "num_tokens": 834346473.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "grad_norm": 1.6094304323196411, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8877707719802856, + "num_tokens": 834380938.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "grad_norm": 1.6462020874023438, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8877993822097778, + "num_tokens": 834411946.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "grad_norm": 1.6076432466506958, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8877789378166199, + "num_tokens": 834446923.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "grad_norm": 1.5023818016052246, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8885320425033569, + "num_tokens": 834483712.0, + "step": 21872 + }, + { + "epoch": 2.782470423610228, + "grad_norm": 1.4839158058166504, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8827453851699829, + "num_tokens": 834523113.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "grad_norm": 1.7348891496658325, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8796328902244568, + "num_tokens": 834554904.0, + "step": 21874 + }, + { + "epoch": 
2.782724844167409, + "grad_norm": 1.6153956651687622, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8959870338439941, + "num_tokens": 834590538.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "grad_norm": 1.5914438962936401, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8837610483169556, + "num_tokens": 834626024.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "grad_norm": 1.404189109802246, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8906075954437256, + "num_tokens": 834666500.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "grad_norm": 1.6865501403808594, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8874150514602661, + "num_tokens": 834701377.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "grad_norm": 1.4346650838851929, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8787853121757507, + "num_tokens": 834742941.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "grad_norm": 1.64261794090271, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8837940692901611, + "num_tokens": 834776789.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "grad_norm": 1.6138559579849243, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8812890648841858, + "num_tokens": 834813707.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "grad_norm": 1.4959992170333862, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8746786117553711, + "num_tokens": 834858519.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "grad_norm": 1.3669229745864868, + "learning_rate": 1e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.9036330580711365, + "num_tokens": 834900122.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "grad_norm": 1.6600855588912964, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 
0.8712140321731567, + "num_tokens": 834940399.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "grad_norm": 1.594323754310608, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8732262849807739, + "num_tokens": 834981253.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "grad_norm": 1.5376619100570679, + "learning_rate": 1e-06, + "loss": 0.277, + "mean_token_accuracy": 0.9026371240615845, + "num_tokens": 835018722.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "grad_norm": 1.5557693243026733, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8608019351959229, + "num_tokens": 835062154.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "grad_norm": 1.528169870376587, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.888256311416626, + "num_tokens": 835101924.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "grad_norm": 1.7471399307250977, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8857234120368958, + "num_tokens": 835133009.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "grad_norm": 1.458864688873291, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8776941299438477, + "num_tokens": 835177316.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "grad_norm": 1.6277273893356323, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8757607340812683, + "num_tokens": 835214853.0, + "step": 21891 + }, + { + "epoch": 2.7848874189034474, + "grad_norm": 1.5504388809204102, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8831546306610107, + "num_tokens": 835249178.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "grad_norm": 1.602318525314331, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8959245085716248, + "num_tokens": 835280559.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "grad_norm": 
1.647487759590149, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8855394124984741, + "num_tokens": 835317892.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "grad_norm": 1.625535488128662, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8906745314598083, + "num_tokens": 835351699.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "grad_norm": 1.4569611549377441, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.883094072341919, + "num_tokens": 835393887.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "grad_norm": 1.44840669631958, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8792651891708374, + "num_tokens": 835435551.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "grad_norm": 1.5272518396377563, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8952517509460449, + "num_tokens": 835474046.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "grad_norm": 1.4462721347808838, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8857152462005615, + "num_tokens": 835515761.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "grad_norm": 1.5063055753707886, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8925025463104248, + "num_tokens": 835554332.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "grad_norm": 1.5567262172698975, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8926539421081543, + "num_tokens": 835592583.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "grad_norm": 1.612608790397644, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8777472972869873, + "num_tokens": 835632271.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "grad_norm": 1.734049916267395, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8756003975868225, + "num_tokens": 
835665480.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "grad_norm": 1.4968719482421875, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8847008347511292, + "num_tokens": 835707604.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "grad_norm": 1.444658875465393, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8973395824432373, + "num_tokens": 835748248.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "grad_norm": 1.4939559698104858, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8893614411354065, + "num_tokens": 835788674.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "grad_norm": 1.5214606523513794, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.89606773853302, + "num_tokens": 835825776.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "grad_norm": 1.7061859369277954, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8609201312065125, + "num_tokens": 835864124.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "grad_norm": 1.4771649837493896, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9011675119400024, + "num_tokens": 835904783.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "grad_norm": 1.6570775508880615, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8854749202728271, + "num_tokens": 835938609.0, + "step": 21910 + }, + { + "epoch": 2.787304414196667, + "grad_norm": 1.5074608325958252, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8896030783653259, + "num_tokens": 835979508.0, + "step": 21911 + }, + { + "epoch": 2.7874316244752575, + "grad_norm": 1.55264151096344, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8871387243270874, + "num_tokens": 836017548.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "grad_norm": 1.6101471185684204, + "learning_rate": 1e-06, + 
"loss": 0.3157, + "mean_token_accuracy": 0.8868882060050964, + "num_tokens": 836055512.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "grad_norm": 1.5552555322647095, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8741657137870789, + "num_tokens": 836093986.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "grad_norm": 1.5794984102249146, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8941070437431335, + "num_tokens": 836130449.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "grad_norm": 1.5200743675231934, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8863809704780579, + "num_tokens": 836172008.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "grad_norm": 1.6206165552139282, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8947423696517944, + "num_tokens": 836204336.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "grad_norm": 1.5417912006378174, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8908162713050842, + "num_tokens": 836238649.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "grad_norm": 1.661892056465149, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.880986213684082, + "num_tokens": 836277036.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "grad_norm": 1.5703870058059692, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8921175003051758, + "num_tokens": 836315219.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "grad_norm": 1.5450588464736938, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8775302171707153, + "num_tokens": 836357297.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "grad_norm": 1.7477747201919556, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8823773860931396, + "num_tokens": 836390032.0, + "step": 21922 + }, + { + "epoch": 
2.7888309375397533, + "grad_norm": 1.3997443914413452, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8932722806930542, + "num_tokens": 836432260.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "grad_norm": 1.5591789484024048, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8889659643173218, + "num_tokens": 836470976.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "grad_norm": 1.506977915763855, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8916388750076294, + "num_tokens": 836508951.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "grad_norm": 1.4650943279266357, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8889514207839966, + "num_tokens": 836553621.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "grad_norm": 1.6186007261276245, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8712496757507324, + "num_tokens": 836594133.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "grad_norm": 1.528860330581665, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8956554532051086, + "num_tokens": 836628582.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "grad_norm": 1.5396085977554321, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8928557634353638, + "num_tokens": 836665215.0, + "step": 21929 + }, + { + "epoch": 2.7897214094898866, + "grad_norm": 1.85137140750885, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8851158022880554, + "num_tokens": 836694155.0, + "step": 21930 + }, + { + "epoch": 2.7898486197684775, + "grad_norm": 1.5537055730819702, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8867254257202148, + "num_tokens": 836732367.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "grad_norm": 1.5475828647613525, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 
0.8832600712776184, + "num_tokens": 836769911.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "grad_norm": 1.6265791654586792, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8917791247367859, + "num_tokens": 836804330.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "grad_norm": 1.5665143728256226, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8917726278305054, + "num_tokens": 836835695.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "grad_norm": 1.8983652591705322, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.867489218711853, + "num_tokens": 836866119.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "grad_norm": 1.68035089969635, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8840987682342529, + "num_tokens": 836898450.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "grad_norm": 1.604661226272583, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8695187568664551, + "num_tokens": 836939757.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "grad_norm": 1.5770000219345093, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8876461386680603, + "num_tokens": 836978219.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "grad_norm": 1.6865524053573608, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8789932727813721, + "num_tokens": 837013355.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "grad_norm": 1.5931264162063599, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8895745277404785, + "num_tokens": 837050745.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "grad_norm": 1.623339056968689, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8826969265937805, + "num_tokens": 837084122.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "grad_norm": 
1.5202523469924927, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8849391937255859, + "num_tokens": 837123448.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "grad_norm": 1.6452791690826416, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8960385322570801, + "num_tokens": 837155007.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "grad_norm": 1.71116042137146, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.887121856212616, + "num_tokens": 837185770.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "grad_norm": 1.5138859748840332, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8781126737594604, + "num_tokens": 837227101.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "grad_norm": 1.49624502658844, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8960856199264526, + "num_tokens": 837262749.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "grad_norm": 1.521308183670044, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8873305320739746, + "num_tokens": 837300552.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "grad_norm": 1.5208146572113037, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8865317106246948, + "num_tokens": 837339758.0, + "step": 21948 + }, + { + "epoch": 2.7921384047831066, + "grad_norm": 1.5515474081039429, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8892453908920288, + "num_tokens": 837380473.0, + "step": 21949 + }, + { + "epoch": 2.792265615061697, + "grad_norm": 1.4940071105957031, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.887597382068634, + "num_tokens": 837418244.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "grad_norm": 1.4865913391113281, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8770350217819214, + "num_tokens": 
837458147.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "grad_norm": 1.3460311889648438, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8942375183105469, + "num_tokens": 837501821.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "grad_norm": 1.6279939413070679, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8925831317901611, + "num_tokens": 837536407.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "grad_norm": 1.623896837234497, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8826811909675598, + "num_tokens": 837571647.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "grad_norm": 1.6503214836120605, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8773400783538818, + "num_tokens": 837609071.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "grad_norm": 1.5328346490859985, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8888105750083923, + "num_tokens": 837650169.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "grad_norm": 1.7045905590057373, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8919768929481506, + "num_tokens": 837680854.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "grad_norm": 1.620613694190979, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8918582201004028, + "num_tokens": 837715704.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "grad_norm": 1.527613878250122, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.876366138458252, + "num_tokens": 837754211.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "grad_norm": 1.623598575592041, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.883073627948761, + "num_tokens": 837789833.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "grad_norm": 1.4680746793746948, + "learning_rate": 
1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8664367198944092, + "num_tokens": 837836248.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "grad_norm": 1.5026081800460815, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8797042369842529, + "num_tokens": 837880066.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "grad_norm": 1.4912807941436768, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8874399662017822, + "num_tokens": 837917604.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "grad_norm": 1.7150582075119019, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8825567960739136, + "num_tokens": 837949802.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "grad_norm": 1.8077592849731445, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.875510036945343, + "num_tokens": 837984486.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "grad_norm": 1.5570696592330933, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8866085410118103, + "num_tokens": 838024577.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "grad_norm": 1.5520107746124268, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8814959526062012, + "num_tokens": 838063246.0, + "step": 21967 + }, + { + "epoch": 2.794555400076326, + "grad_norm": 1.592246413230896, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8807291984558105, + "num_tokens": 838101802.0, + "step": 21968 + }, + { + "epoch": 2.7946826103549167, + "grad_norm": 1.6832728385925293, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8708128929138184, + "num_tokens": 838142551.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "grad_norm": 1.5375196933746338, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.893942654132843, + "num_tokens": 838178954.0, + "step": 21970 + }, + { + 
"epoch": 2.7949370309120978, + "grad_norm": 1.6248724460601807, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8779338002204895, + "num_tokens": 838218443.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "grad_norm": 1.3975603580474854, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8890734314918518, + "num_tokens": 838262796.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "grad_norm": 1.4919418096542358, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.895652174949646, + "num_tokens": 838300948.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "grad_norm": 1.559287428855896, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8889847993850708, + "num_tokens": 838336279.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "grad_norm": 1.4927726984024048, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8677315711975098, + "num_tokens": 838383467.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "grad_norm": 1.5798718929290771, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8889498710632324, + "num_tokens": 838418993.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "grad_norm": 1.5074262619018555, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8797072172164917, + "num_tokens": 838460394.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "grad_norm": 1.5605164766311646, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8866193294525146, + "num_tokens": 838497312.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "grad_norm": 1.4598727226257324, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8824176788330078, + "num_tokens": 838539337.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "grad_norm": 1.6922709941864014, + "learning_rate": 1e-06, + "loss": 0.3022, + 
"mean_token_accuracy": 0.8924069404602051, + "num_tokens": 838570918.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "grad_norm": 1.4449589252471924, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8841073513031006, + "num_tokens": 838610490.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "grad_norm": 1.6330655813217163, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8873142004013062, + "num_tokens": 838647226.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "grad_norm": 1.4764659404754639, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8779469728469849, + "num_tokens": 838688156.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "grad_norm": 1.4288212060928345, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8875450491905212, + "num_tokens": 838732240.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "grad_norm": 1.4908654689788818, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8711534738540649, + "num_tokens": 838777400.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "grad_norm": 1.535931944847107, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8844667077064514, + "num_tokens": 838818813.0, + "step": 21986 + }, + { + "epoch": 2.796972395369546, + "grad_norm": 1.4947911500930786, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8945519328117371, + "num_tokens": 838854296.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "grad_norm": 1.7229092121124268, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8588641881942749, + "num_tokens": 838891704.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "grad_norm": 1.6538890600204468, + "learning_rate": 1e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.899477481842041, + "num_tokens": 838922382.0, + "step": 21989 + }, + { + "epoch": 
2.7973540262053174, + "grad_norm": 1.6271779537200928, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8761380314826965, + "num_tokens": 838957805.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "grad_norm": 1.510282039642334, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.884032130241394, + "num_tokens": 838995811.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "grad_norm": 1.5988658666610718, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8830417394638062, + "num_tokens": 839029914.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "grad_norm": 1.4656856060028076, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.891098141670227, + "num_tokens": 839068870.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "grad_norm": 1.748141884803772, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8905311822891235, + "num_tokens": 839099561.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "grad_norm": 1.6173490285873413, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8823398351669312, + "num_tokens": 839138029.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "grad_norm": 1.5002578496932983, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8868526220321655, + "num_tokens": 839179621.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "grad_norm": 1.4579497575759888, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8880944848060608, + "num_tokens": 839219838.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "grad_norm": 1.3890715837478638, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8987191915512085, + "num_tokens": 839259869.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "grad_norm": 1.6133761405944824, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 
0.8829048871994019, + "num_tokens": 839296179.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "grad_norm": 1.6864817142486572, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8810680508613586, + "num_tokens": 839333535.0, + "step": 22000 + }, + { + "epoch": 2.7987533392698127, + "grad_norm": 1.5516520738601685, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8874685764312744, + "num_tokens": 839372167.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "grad_norm": 1.5536977052688599, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8903517127037048, + "num_tokens": 839408436.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "grad_norm": 1.6341063976287842, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8843060731887817, + "num_tokens": 839444453.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "grad_norm": 1.487693428993225, + "learning_rate": 1e-06, + "loss": 0.2565, + "mean_token_accuracy": 0.9052287936210632, + "num_tokens": 839480775.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "grad_norm": 1.7478837966918945, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8737035393714905, + "num_tokens": 839516578.0, + "step": 22005 + }, + { + "epoch": 2.799389390662766, + "grad_norm": 1.534601092338562, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8915336728096008, + "num_tokens": 839555669.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "grad_norm": 1.5324642658233643, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8985656499862671, + "num_tokens": 839592386.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "grad_norm": 1.4724892377853394, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8888933062553406, + "num_tokens": 839632363.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "grad_norm": 
1.6791383028030396, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8662167191505432, + "num_tokens": 839668182.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "grad_norm": 1.5587679147720337, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8910543322563171, + "num_tokens": 839706436.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "grad_norm": 1.4916698932647705, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.893756628036499, + "num_tokens": 839741284.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "grad_norm": 1.7287545204162598, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8691344261169434, + "num_tokens": 839774257.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "grad_norm": 1.6417421102523804, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8823301196098328, + "num_tokens": 839810323.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "grad_norm": 1.6427137851715088, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.86780846118927, + "num_tokens": 839847972.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "grad_norm": 1.4564847946166992, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8833833932876587, + "num_tokens": 839890610.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "grad_norm": 1.566022515296936, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8911113739013672, + "num_tokens": 839931694.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "grad_norm": 1.5290392637252808, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8949040174484253, + "num_tokens": 839964388.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "grad_norm": 1.599661111831665, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8788434267044067, + "num_tokens": 
840001560.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "grad_norm": 1.3917220830917358, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8886837959289551, + "num_tokens": 840043607.0, + "step": 22019 + }, + { + "epoch": 2.8011703345630328, + "grad_norm": 1.431165337562561, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8852839469909668, + "num_tokens": 840085336.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "grad_norm": 1.6037169694900513, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.888664722442627, + "num_tokens": 840121610.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "grad_norm": 1.5244112014770508, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.889285147190094, + "num_tokens": 840158982.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "grad_norm": 1.4377633333206177, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.885460376739502, + "num_tokens": 840203197.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "grad_norm": 1.6500240564346313, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8746084570884705, + "num_tokens": 840241773.0, + "step": 22024 + }, + { + "epoch": 2.8018063859559854, + "grad_norm": 1.432085633277893, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8936165571212769, + "num_tokens": 840284054.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "grad_norm": 1.534507393836975, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8856979012489319, + "num_tokens": 840327082.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "grad_norm": 1.5868902206420898, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8851205110549927, + "num_tokens": 840364191.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "grad_norm": 1.5464848279953003, + "learning_rate": 
1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8845270872116089, + "num_tokens": 840403833.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "grad_norm": 1.585801124572754, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8832558989524841, + "num_tokens": 840443751.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "grad_norm": 1.6240874528884888, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.879525899887085, + "num_tokens": 840479004.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "grad_norm": 1.559873104095459, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.890537679195404, + "num_tokens": 840516827.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "grad_norm": 1.64544677734375, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.880056619644165, + "num_tokens": 840553810.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "grad_norm": 1.5413364171981812, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8860899806022644, + "num_tokens": 840593455.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "grad_norm": 1.4557509422302246, + "learning_rate": 1e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9021144509315491, + "num_tokens": 840632630.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "grad_norm": 1.6505298614501953, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.883841872215271, + "num_tokens": 840668985.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "grad_norm": 1.5631186962127686, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8807767629623413, + "num_tokens": 840710289.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "grad_norm": 1.470942735671997, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8919403553009033, + "num_tokens": 840747534.0, + "step": 22037 + }, + { + 
"epoch": 2.803460119577662, + "grad_norm": 1.4495841264724731, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8974244594573975, + "num_tokens": 840788299.0, + "step": 22038 + }, + { + "epoch": 2.8035873298562524, + "grad_norm": 1.504123568534851, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8985572457313538, + "num_tokens": 840822591.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "grad_norm": 1.7621370553970337, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.877738356590271, + "num_tokens": 840852174.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "grad_norm": 1.411392092704773, + "learning_rate": 1e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9053316116333008, + "num_tokens": 840889739.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "grad_norm": 1.4487298727035522, + "learning_rate": 1e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9047229290008545, + "num_tokens": 840929669.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "grad_norm": 1.562343955039978, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8879777193069458, + "num_tokens": 840971377.0, + "step": 22043 + }, + { + "epoch": 2.804223381249205, + "grad_norm": 1.644190788269043, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8914873600006104, + "num_tokens": 841005868.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "grad_norm": 1.5888441801071167, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.892871618270874, + "num_tokens": 841039393.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "grad_norm": 1.6559526920318604, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8877761363983154, + "num_tokens": 841076183.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "grad_norm": 1.5788040161132812, + "learning_rate": 1e-06, + "loss": 0.3386, + 
"mean_token_accuracy": 0.8783484697341919, + "num_tokens": 841113523.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "grad_norm": 1.5165719985961914, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8854987621307373, + "num_tokens": 841153808.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "grad_norm": 1.7867170572280884, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8832233548164368, + "num_tokens": 841184252.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "grad_norm": 1.7448623180389404, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8768073320388794, + "num_tokens": 841219140.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "grad_norm": 1.4648314714431763, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8951541185379028, + "num_tokens": 841258935.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "grad_norm": 1.6298662424087524, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8773065805435181, + "num_tokens": 841296757.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "grad_norm": 1.5122367143630981, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.884445071220398, + "num_tokens": 841337894.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "grad_norm": 1.6960848569869995, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8719276785850525, + "num_tokens": 841373995.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "grad_norm": 1.635860800743103, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8784058094024658, + "num_tokens": 841410163.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "grad_norm": 1.6916478872299194, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8952513337135315, + "num_tokens": 841442275.0, + "step": 22056 + }, + { + "epoch": 
2.8058771148708814, + "grad_norm": 1.5191526412963867, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8893523216247559, + "num_tokens": 841483783.0, + "step": 22057 + }, + { + "epoch": 2.806004325149472, + "grad_norm": 1.5805307626724243, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8843162059783936, + "num_tokens": 841524251.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "grad_norm": 1.5298378467559814, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8869636058807373, + "num_tokens": 841568239.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "grad_norm": 1.6443383693695068, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8914979696273804, + "num_tokens": 841609432.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "grad_norm": 1.520453691482544, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8869614601135254, + "num_tokens": 841652448.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "grad_norm": 1.379988431930542, + "learning_rate": 1e-06, + "loss": 0.261, + "mean_token_accuracy": 0.9071189761161804, + "num_tokens": 841694349.0, + "step": 22062 + }, + { + "epoch": 2.8066403765424246, + "grad_norm": 1.4045674800872803, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8954847455024719, + "num_tokens": 841738138.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "grad_norm": 1.5334630012512207, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.892214834690094, + "num_tokens": 841776800.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "grad_norm": 1.439312219619751, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.8973974585533142, + "num_tokens": 841813976.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "grad_norm": 1.5562843084335327, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 
0.8708689212799072, + "num_tokens": 841855922.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "grad_norm": 1.5949064493179321, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8846789598464966, + "num_tokens": 841894173.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "grad_norm": 1.4864169359207153, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8899135589599609, + "num_tokens": 841931777.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "grad_norm": 1.6120814085006714, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8747684955596924, + "num_tokens": 841968186.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "grad_norm": 1.5706430673599243, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8870706558227539, + "num_tokens": 842007875.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "grad_norm": 1.5983312129974365, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8721998929977417, + "num_tokens": 842046463.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "grad_norm": 1.6165568828582764, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8812898993492126, + "num_tokens": 842085061.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "grad_norm": 1.493043065071106, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8976529240608215, + "num_tokens": 842122017.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "grad_norm": 1.6626371145248413, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8887336254119873, + "num_tokens": 842159565.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "grad_norm": 1.5385193824768066, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8781035542488098, + "num_tokens": 842199103.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "grad_norm": 
1.5398253202438354, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.9002238512039185, + "num_tokens": 842233431.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "grad_norm": 1.6814310550689697, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8907849788665771, + "num_tokens": 842272398.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "grad_norm": 1.520693063735962, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8860020041465759, + "num_tokens": 842319817.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "grad_norm": 1.649773120880127, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8846077919006348, + "num_tokens": 842352948.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "grad_norm": 1.4905544519424438, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8863147497177124, + "num_tokens": 842396990.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "grad_norm": 1.6321730613708496, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8786677718162537, + "num_tokens": 842434681.0, + "step": 22081 + }, + { + "epoch": 2.809057371835644, + "grad_norm": 1.6699903011322021, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8899023532867432, + "num_tokens": 842468750.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "grad_norm": 1.5880409479141235, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8888441324234009, + "num_tokens": 842502695.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "grad_norm": 1.6590033769607544, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8858964443206787, + "num_tokens": 842536573.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "grad_norm": 1.6289538145065308, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8784826993942261, + "num_tokens": 
842570753.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "grad_norm": 1.5802462100982666, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8902473449707031, + "num_tokens": 842606871.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "grad_norm": 1.5587337017059326, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8866311311721802, + "num_tokens": 842645379.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "grad_norm": 1.5912748575210571, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8663014769554138, + "num_tokens": 842685721.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "grad_norm": 1.6118836402893066, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8907039761543274, + "num_tokens": 842724743.0, + "step": 22089 + }, + { + "epoch": 2.8100750540643684, + "grad_norm": 1.6080564260482788, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8758174180984497, + "num_tokens": 842761534.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "grad_norm": 1.43814218044281, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8821290135383606, + "num_tokens": 842804887.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "grad_norm": 1.5360575914382935, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8772861957550049, + "num_tokens": 842842756.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "grad_norm": 1.4908453226089478, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8921079635620117, + "num_tokens": 842883170.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "grad_norm": 1.8361879587173462, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8807772397994995, + "num_tokens": 842913557.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "grad_norm": 1.6146793365478516, + "learning_rate": 1e-06, 
+ "loss": 0.3453, + "mean_token_accuracy": 0.8788474202156067, + "num_tokens": 842949400.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "grad_norm": 1.6737829446792603, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8704524040222168, + "num_tokens": 842987536.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "grad_norm": 1.540047287940979, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8855159282684326, + "num_tokens": 843025621.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "grad_norm": 1.5278419256210327, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8930745720863342, + "num_tokens": 843062572.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "grad_norm": 1.6653255224227905, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8955807089805603, + "num_tokens": 843094966.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "grad_norm": 1.581026315689087, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8749967813491821, + "num_tokens": 843133193.0, + "step": 22100 + }, + { + "epoch": 2.811474367128864, + "grad_norm": 1.470703363418579, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8890873193740845, + "num_tokens": 843172476.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "grad_norm": 1.650759220123291, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8773404955863953, + "num_tokens": 843208680.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "grad_norm": 1.5994195938110352, + "learning_rate": 1e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.9008811116218567, + "num_tokens": 843244776.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "grad_norm": 1.5308027267456055, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8830552101135254, + "num_tokens": 843282585.0, + "step": 22104 + }, + { + 
"epoch": 2.811983208243226, + "grad_norm": 1.6595921516418457, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8728394508361816, + "num_tokens": 843319036.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "grad_norm": 1.6545310020446777, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.888849139213562, + "num_tokens": 843352573.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "grad_norm": 1.4647053480148315, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8874706029891968, + "num_tokens": 843392667.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "grad_norm": 1.4928117990493774, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8893235921859741, + "num_tokens": 843436273.0, + "step": 22108 + }, + { + "epoch": 2.812492049357588, + "grad_norm": 1.4943362474441528, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8752984404563904, + "num_tokens": 843480576.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "grad_norm": 1.4709912538528442, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8836933970451355, + "num_tokens": 843522587.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "grad_norm": 1.7644275426864624, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8833131790161133, + "num_tokens": 843554730.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "grad_norm": 1.626876711845398, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.879058301448822, + "num_tokens": 843592961.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "grad_norm": 1.5806808471679688, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8867715001106262, + "num_tokens": 843629226.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "grad_norm": 1.4650018215179443, + "learning_rate": 1e-06, + "loss": 0.3582, + 
"mean_token_accuracy": 0.871545672416687, + "num_tokens": 843674809.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "grad_norm": 1.7659050226211548, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.872273862361908, + "num_tokens": 843706863.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "grad_norm": 1.4324016571044922, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8907827138900757, + "num_tokens": 843747915.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "grad_norm": 1.5506900548934937, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8862884044647217, + "num_tokens": 843784982.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "grad_norm": 1.800528883934021, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8903728723526001, + "num_tokens": 843814458.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "grad_norm": 1.7048022747039795, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8621361255645752, + "num_tokens": 843851107.0, + "step": 22119 + }, + { + "epoch": 2.813891362422084, + "grad_norm": 1.4460667371749878, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8900651335716248, + "num_tokens": 843892036.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "grad_norm": 1.426046371459961, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8865871429443359, + "num_tokens": 843936738.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "grad_norm": 1.5210907459259033, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8834367990493774, + "num_tokens": 843977557.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "grad_norm": 1.5767788887023926, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8901697993278503, + "num_tokens": 844013297.0, + "step": 22123 + }, + { + "epoch": 
2.8144002035364455, + "grad_norm": 1.5636094808578491, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8855072259902954, + "num_tokens": 844051816.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "grad_norm": 1.4283722639083862, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.8984243869781494, + "num_tokens": 844090004.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "grad_norm": 1.4926795959472656, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.890011191368103, + "num_tokens": 844130217.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "grad_norm": 1.5450598001480103, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8858839273452759, + "num_tokens": 844166333.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "grad_norm": 1.597364902496338, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.881367564201355, + "num_tokens": 844200365.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "grad_norm": 1.4529536962509155, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.8981410264968872, + "num_tokens": 844241011.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "grad_norm": 1.5542287826538086, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8808399438858032, + "num_tokens": 844281658.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "grad_norm": 1.7713348865509033, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8856416940689087, + "num_tokens": 844317026.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "grad_norm": 1.5000334978103638, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.898255467414856, + "num_tokens": 844353584.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "grad_norm": 1.5767496824264526, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 
0.8868028521537781, + "num_tokens": 844388699.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "grad_norm": 1.5481175184249878, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8932070136070251, + "num_tokens": 844426585.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "grad_norm": 1.6068131923675537, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8754609227180481, + "num_tokens": 844465850.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "grad_norm": 1.4827557802200317, + "learning_rate": 1e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.8981640934944153, + "num_tokens": 844504542.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "grad_norm": 1.7539188861846924, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8636577129364014, + "num_tokens": 844539804.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "grad_norm": 1.628601312637329, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8724759817123413, + "num_tokens": 844575063.0, + "step": 22138 + }, + { + "epoch": 2.8163083577153034, + "grad_norm": 1.6713513135910034, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8812224864959717, + "num_tokens": 844610560.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "grad_norm": 1.4673408269882202, + "learning_rate": 1e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.8977847099304199, + "num_tokens": 844646989.0, + "step": 22140 + }, + { + "epoch": 2.8165627782724845, + "grad_norm": 1.5452680587768555, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8882724642753601, + "num_tokens": 844686575.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "grad_norm": 1.579695701599121, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.889289140701294, + "num_tokens": 844724377.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "grad_norm": 
1.5410586595535278, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8891186714172363, + "num_tokens": 844762568.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "grad_norm": 1.4799681901931763, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8886862993240356, + "num_tokens": 844802855.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "grad_norm": 1.4829576015472412, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.88749760389328, + "num_tokens": 844846216.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "grad_norm": 1.5435311794281006, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8973098993301392, + "num_tokens": 844881222.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "grad_norm": 1.5702451467514038, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.891735315322876, + "num_tokens": 844917433.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "grad_norm": 1.6791528463363647, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8724732995033264, + "num_tokens": 844952523.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "grad_norm": 1.6023386716842651, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8883666396141052, + "num_tokens": 844985939.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "grad_norm": 1.5381423234939575, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8963223695755005, + "num_tokens": 845020373.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "grad_norm": 1.4980720281600952, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8827268481254578, + "num_tokens": 845060919.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "grad_norm": 1.5327231884002686, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9012095332145691, + "num_tokens": 
845097664.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "grad_norm": 1.5207384824752808, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8954271078109741, + "num_tokens": 845131820.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "grad_norm": 1.5596482753753662, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8878682255744934, + "num_tokens": 845168504.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "grad_norm": 1.616445779800415, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8931029438972473, + "num_tokens": 845201661.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "grad_norm": 1.7132813930511475, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8833719491958618, + "num_tokens": 845235769.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "grad_norm": 1.6798341274261475, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8928236365318298, + "num_tokens": 845267448.0, + "step": 22157 + }, + { + "epoch": 2.818725353008523, + "grad_norm": 1.6907192468643188, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8748799562454224, + "num_tokens": 845304848.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "grad_norm": 1.4415498971939087, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8763443231582642, + "num_tokens": 845349006.0, + "step": 22159 + }, + { + "epoch": 2.818979773565704, + "grad_norm": 1.5241485834121704, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8882274031639099, + "num_tokens": 845385936.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "grad_norm": 1.621126651763916, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8797416687011719, + "num_tokens": 845424359.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "grad_norm": 1.590300440788269, + "learning_rate": 
1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8806923627853394, + "num_tokens": 845463338.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "grad_norm": 1.5703338384628296, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8871017694473267, + "num_tokens": 845501029.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "grad_norm": 1.6136283874511719, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8711491227149963, + "num_tokens": 845541742.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "grad_norm": 1.5715447664260864, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8901119232177734, + "num_tokens": 845578155.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "grad_norm": 1.4925893545150757, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8925623893737793, + "num_tokens": 845615377.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "grad_norm": 1.4904019832611084, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8939643502235413, + "num_tokens": 845656081.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "grad_norm": 1.6196717023849487, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8866991400718689, + "num_tokens": 845688652.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "grad_norm": 1.4453115463256836, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.8979994654655457, + "num_tokens": 845731195.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "grad_norm": 1.5824882984161377, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8992642164230347, + "num_tokens": 845763168.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "grad_norm": 1.4199501276016235, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.884017825126648, + "num_tokens": 845808316.0, + "step": 22171 + }, + { 
+ "epoch": 2.82050629690879, + "grad_norm": 1.5886298418045044, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8762956261634827, + "num_tokens": 845847426.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "grad_norm": 1.5603225231170654, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8934807777404785, + "num_tokens": 845884469.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "grad_norm": 1.5676519870758057, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8839553594589233, + "num_tokens": 845923739.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "grad_norm": 1.3896527290344238, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8957456350326538, + "num_tokens": 845968550.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "grad_norm": 1.5866026878356934, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8850958347320557, + "num_tokens": 846003705.0, + "step": 22176 + }, + { + "epoch": 2.821142348301743, + "grad_norm": 1.5601898431777954, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8828510046005249, + "num_tokens": 846044341.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "grad_norm": 1.657991647720337, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8925960659980774, + "num_tokens": 846076102.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "grad_norm": 1.6108795404434204, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.88951575756073, + "num_tokens": 846116268.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "grad_norm": 1.4933199882507324, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8899313807487488, + "num_tokens": 846155854.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "grad_norm": 1.4183087348937988, + "learning_rate": 1e-06, + "loss": 0.2806, + 
"mean_token_accuracy": 0.8982705473899841, + "num_tokens": 846198271.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "grad_norm": 1.5522843599319458, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8858381509780884, + "num_tokens": 846236433.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "grad_norm": 1.6149473190307617, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8938167691230774, + "num_tokens": 846274561.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "grad_norm": 1.8203012943267822, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8864514827728271, + "num_tokens": 846308910.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "grad_norm": 1.7887115478515625, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8882272243499756, + "num_tokens": 846339432.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "grad_norm": 1.4632517099380493, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.898589015007019, + "num_tokens": 846377681.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "grad_norm": 1.6225271224975586, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8817582130432129, + "num_tokens": 846411733.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "grad_norm": 1.4779778718948364, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8921530246734619, + "num_tokens": 846449208.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "grad_norm": 1.4780882596969604, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.8993679285049438, + "num_tokens": 846488070.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "grad_norm": 1.7864227294921875, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8908852934837341, + "num_tokens": 846527289.0, + "step": 22190 + }, + { + "epoch": 
2.82292329220201, + "grad_norm": 1.4366377592086792, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8988815546035767, + "num_tokens": 846567469.0, + "step": 22191 + }, + { + "epoch": 2.8230505024806005, + "grad_norm": 1.556577444076538, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8874980807304382, + "num_tokens": 846604960.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "grad_norm": 1.6907974481582642, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8874872326850891, + "num_tokens": 846638323.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "grad_norm": 1.4447333812713623, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8875768184661865, + "num_tokens": 846679084.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "grad_norm": 1.574336051940918, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8713068962097168, + "num_tokens": 846718722.0, + "step": 22195 + }, + { + "epoch": 2.8235593435949626, + "grad_norm": 1.4407522678375244, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8861801624298096, + "num_tokens": 846759124.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "grad_norm": 1.5215834379196167, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8828476667404175, + "num_tokens": 846799752.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "grad_norm": 1.7801164388656616, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.881972074508667, + "num_tokens": 846830045.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "grad_norm": 1.5562881231307983, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8732196092605591, + "num_tokens": 846869093.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "grad_norm": 1.6544111967086792, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 
0.8906756639480591, + "num_tokens": 846906970.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "grad_norm": 1.5390610694885254, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.893578052520752, + "num_tokens": 846943922.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "grad_norm": 1.5920946598052979, + "learning_rate": 1e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9010800123214722, + "num_tokens": 846975681.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "grad_norm": 1.4832024574279785, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8998105525970459, + "num_tokens": 847013308.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "grad_norm": 1.475480318069458, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8948456048965454, + "num_tokens": 847053685.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "grad_norm": 1.5951118469238281, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8936688899993896, + "num_tokens": 847093757.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "grad_norm": 1.7166770696640015, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.883911669254303, + "num_tokens": 847127899.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "grad_norm": 1.4733867645263672, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8998191952705383, + "num_tokens": 847162625.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "grad_norm": 1.4940181970596313, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8909472227096558, + "num_tokens": 847202083.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "grad_norm": 1.5292935371398926, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8765740990638733, + "num_tokens": 847242720.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "grad_norm": 
1.5708894729614258, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8907449245452881, + "num_tokens": 847280250.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + "grad_norm": 1.3955159187316895, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8888832330703735, + "num_tokens": 847322891.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "grad_norm": 1.554640293121338, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8883461356163025, + "num_tokens": 847362134.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "grad_norm": 1.4673924446105957, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8874704837799072, + "num_tokens": 847405271.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "grad_norm": 1.453945279121399, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8823280334472656, + "num_tokens": 847447156.0, + "step": 22214 + }, + { + "epoch": 2.8259763388881822, + "grad_norm": 1.4467778205871582, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8891562819480896, + "num_tokens": 847486868.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "grad_norm": 1.6980900764465332, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8802576065063477, + "num_tokens": 847521453.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "grad_norm": 1.4499363899230957, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8937923908233643, + "num_tokens": 847562873.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "grad_norm": 1.5669701099395752, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8878716230392456, + "num_tokens": 847600312.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "grad_norm": 1.6241942644119263, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8796309232711792, + "num_tokens": 
847639800.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "grad_norm": 1.421849012374878, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.894378662109375, + "num_tokens": 847683071.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "grad_norm": 1.629603385925293, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.888153076171875, + "num_tokens": 847720783.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "grad_norm": 1.5452115535736084, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8896260261535645, + "num_tokens": 847755567.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "grad_norm": 1.5065325498580933, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.899979829788208, + "num_tokens": 847791927.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "grad_norm": 1.454856276512146, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8887258768081665, + "num_tokens": 847832001.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "grad_norm": 1.6249463558197021, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8897871971130371, + "num_tokens": 847867304.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "grad_norm": 1.6429523229599, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8950438499450684, + "num_tokens": 847899654.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "grad_norm": 1.5215498208999634, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8898852467536926, + "num_tokens": 847936038.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "grad_norm": 1.4029738903045654, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8880366683006287, + "num_tokens": 847979016.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "grad_norm": 1.4681488275527954, + "learning_rate": 1e-06, + 
"loss": 0.3055, + "mean_token_accuracy": 0.8896550536155701, + "num_tokens": 848016945.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "grad_norm": 1.5478543043136597, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8851901292800903, + "num_tokens": 848054591.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "grad_norm": 1.6040705442428589, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.883274257183075, + "num_tokens": 848094976.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "grad_norm": 1.7289422750473022, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8804367780685425, + "num_tokens": 848131916.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "grad_norm": 1.6629761457443237, + "learning_rate": 1e-06, + "loss": 0.2696, + "mean_token_accuracy": 0.9024534225463867, + "num_tokens": 848160606.0, + "step": 22233 + }, + { + "epoch": 2.828393334181402, + "grad_norm": 1.565531611442566, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8852931261062622, + "num_tokens": 848200874.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "grad_norm": 1.493780493736267, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.8996425867080688, + "num_tokens": 848238964.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "grad_norm": 1.5598434209823608, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8862450122833252, + "num_tokens": 848276212.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "grad_norm": 1.9543052911758423, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.878666877746582, + "num_tokens": 848316314.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "grad_norm": 1.6046494245529175, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8948239684104919, + "num_tokens": 848350562.0, + "step": 22238 + }, + { + "epoch": 
2.8290293855743545, + "grad_norm": 1.5890949964523315, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8742828369140625, + "num_tokens": 848391457.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "grad_norm": 1.4762158393859863, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8892834782600403, + "num_tokens": 848432779.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "grad_norm": 1.6168241500854492, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8841838240623474, + "num_tokens": 848467821.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "grad_norm": 1.4407404661178589, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.8981904983520508, + "num_tokens": 848507874.0, + "step": 22242 + }, + { + "epoch": 2.8295382266887166, + "grad_norm": 1.5135142803192139, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8917831182479858, + "num_tokens": 848544287.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "grad_norm": 1.6446588039398193, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8936448693275452, + "num_tokens": 848577303.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "grad_norm": 1.4416767358779907, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.893953263759613, + "num_tokens": 848615483.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "grad_norm": 1.4305986166000366, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.9007846117019653, + "num_tokens": 848655908.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "grad_norm": 1.5300278663635254, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8828372359275818, + "num_tokens": 848700065.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "grad_norm": 1.6318981647491455, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 
0.8873469233512878, + "num_tokens": 848735023.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "grad_norm": 1.6488549709320068, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8795993328094482, + "num_tokens": 848773235.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "grad_norm": 1.459952473640442, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.888298749923706, + "num_tokens": 848814617.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "grad_norm": 1.577561378479004, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8868211507797241, + "num_tokens": 848851539.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "grad_norm": 1.6000316143035889, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8931287527084351, + "num_tokens": 848885659.0, + "step": 22252 + }, + { + "epoch": 2.8308103294746214, + "grad_norm": 1.5811461210250854, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.87046879529953, + "num_tokens": 848924341.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "grad_norm": 1.5907944440841675, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.887053906917572, + "num_tokens": 848958824.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "grad_norm": 1.5309211015701294, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8856559991836548, + "num_tokens": 848998432.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "grad_norm": 1.594692349433899, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8999029994010925, + "num_tokens": 849034738.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "grad_norm": 1.5085508823394775, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.88603276014328, + "num_tokens": 849072148.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "grad_norm": 
1.5344187021255493, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8973451256752014, + "num_tokens": 849107946.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "grad_norm": 1.5856176614761353, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8823469877243042, + "num_tokens": 849143024.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "grad_norm": 1.4327812194824219, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8827515840530396, + "num_tokens": 849186545.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "grad_norm": 1.445151448249817, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8947705030441284, + "num_tokens": 849228075.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, + "grad_norm": 1.5621551275253296, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.8993090391159058, + "num_tokens": 849264088.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "grad_norm": 1.692264437675476, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8906539678573608, + "num_tokens": 849296719.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "grad_norm": 1.5375608205795288, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8813102841377258, + "num_tokens": 849336358.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "grad_norm": 1.5716511011123657, + "learning_rate": 1e-06, + "loss": 0.284, + "mean_token_accuracy": 0.8984441757202148, + "num_tokens": 849371471.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "grad_norm": 1.6486788988113403, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8809323906898499, + "num_tokens": 849408122.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "grad_norm": 1.4733091592788696, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8812718391418457, + "num_tokens": 
849446503.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "grad_norm": 1.4774481058120728, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.897293746471405, + "num_tokens": 849484609.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "grad_norm": 1.4772522449493408, + "learning_rate": 1e-06, + "loss": 0.264, + "mean_token_accuracy": 0.9054814577102661, + "num_tokens": 849522039.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "grad_norm": 1.7046819925308228, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.876846432685852, + "num_tokens": 849557117.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "grad_norm": 1.7748359441757202, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8613045811653137, + "num_tokens": 849596011.0, + "step": 22271 + }, + { + "epoch": 2.833227324767841, + "grad_norm": 1.651202917098999, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8728276491165161, + "num_tokens": 849636444.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "grad_norm": 1.5738768577575684, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8872929811477661, + "num_tokens": 849677574.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "grad_norm": 1.514642357826233, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8775785565376282, + "num_tokens": 849717754.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "grad_norm": 1.451198935508728, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8736584186553955, + "num_tokens": 849761787.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "grad_norm": 1.747544765472412, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8869180083274841, + "num_tokens": 849791538.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "grad_norm": 1.4636666774749756, + "learning_rate": 1e-06, + 
"loss": 0.3126, + "mean_token_accuracy": 0.8875854015350342, + "num_tokens": 849833748.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "grad_norm": 1.6645677089691162, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8854370713233948, + "num_tokens": 849868294.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "grad_norm": 1.6052777767181396, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8899891376495361, + "num_tokens": 849903313.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "grad_norm": 1.6108254194259644, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8787850737571716, + "num_tokens": 849942775.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "grad_norm": 1.4879887104034424, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8854451775550842, + "num_tokens": 849981801.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "grad_norm": 1.6238231658935547, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8942943215370178, + "num_tokens": 850013352.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "grad_norm": 1.5320628881454468, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8967123031616211, + "num_tokens": 850048036.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "grad_norm": 1.4461156129837036, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8675806522369385, + "num_tokens": 850094853.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "grad_norm": 1.4135382175445557, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8877371549606323, + "num_tokens": 850140010.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "grad_norm": 1.4664429426193237, + "learning_rate": 1e-06, + "loss": 0.2703, + "mean_token_accuracy": 0.9017901420593262, + "num_tokens": 850182423.0, + "step": 22286 + }, + { + 
"epoch": 2.835135478946699, + "grad_norm": 1.6283469200134277, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8870995044708252, + "num_tokens": 850216375.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "grad_norm": 1.5159333944320679, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8841322660446167, + "num_tokens": 850259133.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "grad_norm": 1.5612208843231201, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8906522989273071, + "num_tokens": 850300873.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "grad_norm": 1.5527825355529785, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8880624771118164, + "num_tokens": 850339365.0, + "step": 22290 + }, + { + "epoch": 2.835644320061061, + "grad_norm": 1.615900993347168, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8912459015846252, + "num_tokens": 850378242.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "grad_norm": 1.5343704223632812, + "learning_rate": 1e-06, + "loss": 0.2739, + "mean_token_accuracy": 0.9003734588623047, + "num_tokens": 850411564.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "grad_norm": 1.5604525804519653, + "learning_rate": 1e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.8945223689079285, + "num_tokens": 850447111.0, + "step": 22293 + }, + { + "epoch": 2.8360259508968326, + "grad_norm": 1.673304557800293, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8885069489479065, + "num_tokens": 850479191.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "grad_norm": 1.5551973581314087, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8865517377853394, + "num_tokens": 850517701.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "grad_norm": 1.5844285488128662, + "learning_rate": 1e-06, + "loss": 0.3234, + 
"mean_token_accuracy": 0.883124828338623, + "num_tokens": 850556106.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "grad_norm": 1.6157599687576294, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8933867812156677, + "num_tokens": 850591916.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "grad_norm": 1.6539933681488037, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8907873034477234, + "num_tokens": 850626520.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "grad_norm": 1.6517022848129272, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8837946653366089, + "num_tokens": 850662119.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "grad_norm": 1.5639783143997192, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8883006572723389, + "num_tokens": 850697793.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "grad_norm": 1.7227553129196167, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8778210282325745, + "num_tokens": 850733390.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "grad_norm": 1.5564987659454346, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8907034397125244, + "num_tokens": 850774706.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "grad_norm": 1.679024338722229, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8655031323432922, + "num_tokens": 850814528.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "grad_norm": 1.7260226011276245, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8858909606933594, + "num_tokens": 850845551.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "grad_norm": 1.5715892314910889, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8846562504768372, + "num_tokens": 850883943.0, + "step": 22305 + }, + { + "epoch": 
2.8375524742399185, + "grad_norm": 1.4685906171798706, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8762465715408325, + "num_tokens": 850929194.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "grad_norm": 1.6519322395324707, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8807556629180908, + "num_tokens": 850962703.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "grad_norm": 1.5923569202423096, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8840658068656921, + "num_tokens": 851000809.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "grad_norm": 1.6954948902130127, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8884038925170898, + "num_tokens": 851032777.0, + "step": 22309 + }, + { + "epoch": 2.8380613153542806, + "grad_norm": 1.6073076725006104, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.885291576385498, + "num_tokens": 851069112.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "grad_norm": 1.4456273317337036, + "learning_rate": 1e-06, + "loss": 0.27, + "mean_token_accuracy": 0.9004074335098267, + "num_tokens": 851109040.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "grad_norm": 1.593217372894287, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8882732391357422, + "num_tokens": 851146609.0, + "step": 22312 + }, + { + "epoch": 2.838442946190052, + "grad_norm": 1.508934736251831, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.893316388130188, + "num_tokens": 851182722.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "grad_norm": 1.6431281566619873, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8925873041152954, + "num_tokens": 851218027.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "grad_norm": 1.4456970691680908, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 
0.8754258155822754, + "num_tokens": 851264563.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "grad_norm": 1.4508767127990723, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8868009448051453, + "num_tokens": 851305086.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "grad_norm": 1.6785696744918823, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8732256889343262, + "num_tokens": 851339924.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "grad_norm": 1.466278314590454, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8803412318229675, + "num_tokens": 851381401.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "grad_norm": 1.4926177263259888, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8904460668563843, + "num_tokens": 851420273.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "grad_norm": 1.6851122379302979, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8792728185653687, + "num_tokens": 851451570.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "grad_norm": 1.5335628986358643, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8818694949150085, + "num_tokens": 851489475.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "grad_norm": 1.3661397695541382, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8975685834884644, + "num_tokens": 851533025.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "grad_norm": 1.7186553478240967, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.877138614654541, + "num_tokens": 851568292.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "grad_norm": 1.5248104333877563, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8852524161338806, + "num_tokens": 851606361.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "grad_norm": 
1.4063291549682617, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8964356184005737, + "num_tokens": 851647881.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "grad_norm": 1.5981842279434204, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.891971230506897, + "num_tokens": 851683298.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "grad_norm": 1.440102219581604, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.8975814580917358, + "num_tokens": 851721640.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "grad_norm": 1.6156930923461914, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8904876708984375, + "num_tokens": 851755380.0, + "step": 22328 + }, + { + "epoch": 2.8404783106475002, + "grad_norm": 1.6396361589431763, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8769732117652893, + "num_tokens": 851793148.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "grad_norm": 1.4291422367095947, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.887170672416687, + "num_tokens": 851833446.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "grad_norm": 1.52420175075531, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8851211071014404, + "num_tokens": 851874468.0, + "step": 22331 + }, + { + "epoch": 2.840859941483272, + "grad_norm": 1.6099145412445068, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8876582384109497, + "num_tokens": 851911235.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "grad_norm": 1.7574679851531982, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8803343772888184, + "num_tokens": 851946007.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "grad_norm": 1.5502640008926392, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8916370868682861, + "num_tokens": 
851984363.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "grad_norm": 1.523886799812317, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8832500576972961, + "num_tokens": 852023433.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "grad_norm": 1.6061497926712036, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8939140439033508, + "num_tokens": 852057764.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "grad_norm": 1.641802191734314, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.877078115940094, + "num_tokens": 852097661.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "grad_norm": 1.6620124578475952, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8692770004272461, + "num_tokens": 852140771.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "grad_norm": 1.7583916187286377, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.874054491519928, + "num_tokens": 852178487.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "grad_norm": 1.5977951288223267, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8829888701438904, + "num_tokens": 852214360.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "grad_norm": 1.54136323928833, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.8993738293647766, + "num_tokens": 852251456.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "grad_norm": 1.4957592487335205, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8882509469985962, + "num_tokens": 852292298.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "grad_norm": 1.653845191001892, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8845224976539612, + "num_tokens": 852331864.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "grad_norm": 1.4648953676223755, + "learning_rate": 1e-06, + 
"loss": 0.2864, + "mean_token_accuracy": 0.8959402441978455, + "num_tokens": 852370448.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "grad_norm": 1.5827072858810425, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8781452178955078, + "num_tokens": 852409964.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "grad_norm": 1.6294260025024414, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8724827170372009, + "num_tokens": 852447287.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "grad_norm": 1.5903935432434082, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8847600817680359, + "num_tokens": 852482610.0, + "step": 22347 + }, + { + "epoch": 2.8428953059407203, + "grad_norm": 1.5982012748718262, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8817024230957031, + "num_tokens": 852519805.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "grad_norm": 1.3904505968093872, + "learning_rate": 1e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.9008657932281494, + "num_tokens": 852562069.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "grad_norm": 1.4765551090240479, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8822600245475769, + "num_tokens": 852604808.0, + "step": 22350 + }, + { + "epoch": 2.8432769367764914, + "grad_norm": 1.6427946090698242, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.871035635471344, + "num_tokens": 852641601.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "grad_norm": 1.5400376319885254, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8872394561767578, + "num_tokens": 852682000.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "grad_norm": 1.5456578731536865, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8811832070350647, + "num_tokens": 852722582.0, + "step": 22353 + }, + { + 
"epoch": 2.843658567612263, + "grad_norm": 1.6003354787826538, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8810907602310181, + "num_tokens": 852763397.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "grad_norm": 1.5873662233352661, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8778833746910095, + "num_tokens": 852802345.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "grad_norm": 1.5635930299758911, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8759717345237732, + "num_tokens": 852839927.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "grad_norm": 1.5853228569030762, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.876655101776123, + "num_tokens": 852881639.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "grad_norm": 1.4841569662094116, + "learning_rate": 1e-06, + "loss": 0.2624, + "mean_token_accuracy": 0.9034602642059326, + "num_tokens": 852916725.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "grad_norm": 1.5882681608200073, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8818755149841309, + "num_tokens": 852954326.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "grad_norm": 1.5054856538772583, + "learning_rate": 1e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9038065671920776, + "num_tokens": 852991237.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "grad_norm": 1.8580657243728638, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8737186193466187, + "num_tokens": 853020517.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "grad_norm": 1.4987382888793945, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8865494132041931, + "num_tokens": 853064329.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "grad_norm": 1.6834945678710938, + "learning_rate": 1e-06, + "loss": 0.3469, + 
"mean_token_accuracy": 0.8737738132476807, + "num_tokens": 853099171.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "grad_norm": 1.445259928703308, + "learning_rate": 1e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.8998347520828247, + "num_tokens": 853136437.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "grad_norm": 1.610091209411621, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8884398937225342, + "num_tokens": 853170078.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "grad_norm": 1.4623167514801025, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8852857947349548, + "num_tokens": 853213213.0, + "step": 22366 + }, + { + "epoch": 2.84531230123394, + "grad_norm": 1.4455046653747559, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8866446018218994, + "num_tokens": 853258201.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "grad_norm": 1.4519765377044678, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8894443511962891, + "num_tokens": 853298436.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "grad_norm": 1.5273760557174683, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8882389068603516, + "num_tokens": 853338691.0, + "step": 22369 + }, + { + "epoch": 2.845693932069711, + "grad_norm": 1.5685498714447021, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8807448148727417, + "num_tokens": 853378998.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "grad_norm": 1.4517117738723755, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8762694001197815, + "num_tokens": 853422736.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "grad_norm": 1.5361727476119995, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8834843039512634, + "num_tokens": 853462468.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + 
"grad_norm": 1.7319483757019043, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8846975564956665, + "num_tokens": 853493067.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "grad_norm": 1.4773284196853638, + "learning_rate": 1e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.9025888442993164, + "num_tokens": 853531526.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "grad_norm": 1.5014963150024414, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8802670836448669, + "num_tokens": 853572567.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "grad_norm": 1.5937880277633667, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8935884237289429, + "num_tokens": 853607311.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "grad_norm": 1.6799261569976807, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8774756789207458, + "num_tokens": 853644333.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "grad_norm": 1.5411009788513184, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8888592720031738, + "num_tokens": 853682435.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "grad_norm": 1.6718896627426147, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8656402230262756, + "num_tokens": 853723860.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "grad_norm": 1.4391945600509644, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8836103677749634, + "num_tokens": 853766724.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "grad_norm": 1.6600428819656372, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8704937696456909, + "num_tokens": 853804466.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "grad_norm": 1.5789839029312134, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8858300447463989, 
+ "num_tokens": 853842152.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "grad_norm": 1.6997376680374146, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8875303864479065, + "num_tokens": 853874597.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "grad_norm": 1.547184705734253, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8756858706474304, + "num_tokens": 853917643.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "grad_norm": 1.538787603378296, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8866145610809326, + "num_tokens": 853957253.0, + "step": 22385 + }, + { + "epoch": 2.8477292965271594, + "grad_norm": 1.5090974569320679, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8861560225486755, + "num_tokens": 853998025.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "grad_norm": 1.5856643915176392, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8784564733505249, + "num_tokens": 854038063.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "grad_norm": 1.7078349590301514, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8729190230369568, + "num_tokens": 854074293.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "grad_norm": 1.5686794519424438, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.886224627494812, + "num_tokens": 854109079.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "grad_norm": 1.4806231260299683, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8867242336273193, + "num_tokens": 854150142.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "grad_norm": 1.6028552055358887, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8903812170028687, + "num_tokens": 854186058.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "grad_norm": 1.694101333618164, + 
"learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8947153091430664, + "num_tokens": 854217448.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "grad_norm": 1.5508674383163452, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8703477382659912, + "num_tokens": 854261847.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "grad_norm": 1.559034824371338, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8853009939193726, + "num_tokens": 854300368.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "grad_norm": 1.5240375995635986, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8814663887023926, + "num_tokens": 854341059.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "grad_norm": 1.6517993211746216, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8667975068092346, + "num_tokens": 854379555.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "grad_norm": 1.5029977560043335, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8900348544120789, + "num_tokens": 854415832.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "grad_norm": 1.5151972770690918, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8932781219482422, + "num_tokens": 854453448.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "grad_norm": 1.5681118965148926, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8871465921401978, + "num_tokens": 854488887.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "grad_norm": 1.5755665302276611, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8808264136314392, + "num_tokens": 854528619.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "grad_norm": 1.4616554975509644, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8783508539199829, + "num_tokens": 854572694.0, + "step": 
22401 + }, + { + "epoch": 2.8497646609846075, + "grad_norm": 1.747982382774353, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8860267400741577, + "num_tokens": 854604357.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "grad_norm": 1.559658169746399, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.879499077796936, + "num_tokens": 854647534.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "grad_norm": 1.442017912864685, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8880209922790527, + "num_tokens": 854690591.0, + "step": 22404 + }, + { + "epoch": 2.850146291820379, + "grad_norm": 1.4285547733306885, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8887333273887634, + "num_tokens": 854732305.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "grad_norm": 1.633601427078247, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8845372200012207, + "num_tokens": 854766679.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "grad_norm": 1.7319990396499634, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8706600069999695, + "num_tokens": 854804231.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "grad_norm": 1.5037578344345093, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8768184185028076, + "num_tokens": 854844824.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "grad_norm": 1.5043281316757202, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8788871169090271, + "num_tokens": 854886748.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "grad_norm": 1.5465623140335083, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8729528784751892, + "num_tokens": 854930203.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "grad_norm": 1.3943268060684204, + "learning_rate": 1e-06, + "loss": 0.2944, + 
"mean_token_accuracy": 0.8950344920158386, + "num_tokens": 854971439.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "grad_norm": 1.5402696132659912, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8898835778236389, + "num_tokens": 855006818.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "grad_norm": 1.7505300045013428, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8855407238006592, + "num_tokens": 855039214.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "grad_norm": 1.5663975477218628, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8850663900375366, + "num_tokens": 855075774.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "grad_norm": 1.4265594482421875, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.9009434580802917, + "num_tokens": 855116085.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "grad_norm": 1.5928115844726562, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8888882398605347, + "num_tokens": 855151858.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "grad_norm": 1.4182765483856201, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8928118348121643, + "num_tokens": 855196138.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "grad_norm": 1.5496937036514282, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8979868292808533, + "num_tokens": 855230860.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "grad_norm": 1.675933837890625, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8866467475891113, + "num_tokens": 855263972.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "grad_norm": 1.5489540100097656, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.879492998123169, + "num_tokens": 855305035.0, + "step": 22420 + }, + { + "epoch": 
2.8521816562778275, + "grad_norm": 1.5460847616195679, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8793314695358276, + "num_tokens": 855343210.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "grad_norm": 1.5370821952819824, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8816635012626648, + "num_tokens": 855382647.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "grad_norm": 1.574560523033142, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8872317671775818, + "num_tokens": 855420839.0, + "step": 22423 + }, + { + "epoch": 2.8525632871135986, + "grad_norm": 1.5742037296295166, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8836571574211121, + "num_tokens": 855458389.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "grad_norm": 1.6428518295288086, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8930666446685791, + "num_tokens": 855492929.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "grad_norm": 1.5458484888076782, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.889149010181427, + "num_tokens": 855530588.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "grad_norm": 1.5218327045440674, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8706655502319336, + "num_tokens": 855572739.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "grad_norm": 1.5135143995285034, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8771594166755676, + "num_tokens": 855615773.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "grad_norm": 1.5199172496795654, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8985784649848938, + "num_tokens": 855653001.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "grad_norm": 1.5139018297195435, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 
0.8848615288734436, + "num_tokens": 855693637.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "grad_norm": 1.3734468221664429, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.894883394241333, + "num_tokens": 855736123.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "grad_norm": 1.604177474975586, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8903608322143555, + "num_tokens": 855770844.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "grad_norm": 1.5503606796264648, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.892448902130127, + "num_tokens": 855808235.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "grad_norm": 1.472851037979126, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8958686590194702, + "num_tokens": 855848755.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "grad_norm": 1.6170909404754639, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.873404860496521, + "num_tokens": 855886828.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "grad_norm": 1.5563503503799438, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8739566802978516, + "num_tokens": 855926992.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "grad_norm": 1.6128050088882446, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8912436962127686, + "num_tokens": 855962477.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "grad_norm": 1.600890874862671, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8809829950332642, + "num_tokens": 856001833.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "grad_norm": 1.750533103942871, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8826522827148438, + "num_tokens": 856038044.0, + "step": 22439 + }, + { + "epoch": 2.854598651571047, + "grad_norm": 
1.5800896883010864, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8907228708267212, + "num_tokens": 856072993.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "grad_norm": 1.6796904802322388, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8667498826980591, + "num_tokens": 856112417.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "grad_norm": 1.514231562614441, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8748160600662231, + "num_tokens": 856154785.0, + "step": 22442 + }, + { + "epoch": 2.8549802824068182, + "grad_norm": 1.5291491746902466, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.888195276260376, + "num_tokens": 856192705.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "grad_norm": 1.498726725578308, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8958641290664673, + "num_tokens": 856230168.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "grad_norm": 1.5757681131362915, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8805189728736877, + "num_tokens": 856273064.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "grad_norm": 1.5116630792617798, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8918348550796509, + "num_tokens": 856313020.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "grad_norm": 1.4771220684051514, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8868512511253357, + "num_tokens": 856353765.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "grad_norm": 1.5311812162399292, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8937634229660034, + "num_tokens": 856390472.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "grad_norm": 1.5370959043502808, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8904086351394653, + "num_tokens": 
856429718.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "grad_norm": 1.6386618614196777, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.874224066734314, + "num_tokens": 856472344.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "grad_norm": 1.5191320180892944, + "learning_rate": 1e-06, + "loss": 0.2515, + "mean_token_accuracy": 0.9067273139953613, + "num_tokens": 856509017.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "grad_norm": 1.6743435859680176, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8886672854423523, + "num_tokens": 856541134.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "grad_norm": 1.561407446861267, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8957918882369995, + "num_tokens": 856577055.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "grad_norm": 1.5784448385238647, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.88896644115448, + "num_tokens": 856614111.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "grad_norm": 1.5265192985534668, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8850305080413818, + "num_tokens": 856652874.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "grad_norm": 1.5902736186981201, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8871048092842102, + "num_tokens": 856686246.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "grad_norm": 1.5422831773757935, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.865221381187439, + "num_tokens": 856728481.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "grad_norm": 1.6045217514038086, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8852466940879822, + "num_tokens": 856765770.0, + "step": 22458 + }, + { + "epoch": 2.8570156468642667, + "grad_norm": 1.5584243535995483, + "learning_rate": 1e-06, + 
"loss": 0.2961, + "mean_token_accuracy": 0.8931323289871216, + "num_tokens": 856801477.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.4297091960906982, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8834041357040405, + "num_tokens": 856846851.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "grad_norm": 1.4062998294830322, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8919320106506348, + "num_tokens": 856887823.0, + "step": 22461 + }, + { + "epoch": 2.8573972777000383, + "grad_norm": 1.5355244874954224, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8778748512268066, + "num_tokens": 856927373.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "grad_norm": 1.528273344039917, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.892670750617981, + "num_tokens": 856966910.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "grad_norm": 1.6373212337493896, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8836404085159302, + "num_tokens": 857003599.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "grad_norm": 1.6467101573944092, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.873630166053772, + "num_tokens": 857044617.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "grad_norm": 1.4231528043746948, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8890819549560547, + "num_tokens": 857084697.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "grad_norm": 1.5356357097625732, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8902370929718018, + "num_tokens": 857123019.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "grad_norm": 1.569810152053833, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8916571140289307, + "num_tokens": 857163260.0, + "step": 22468 + }, + { + "epoch": 
2.858287749650172, + "grad_norm": 1.4692612886428833, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8936854004859924, + "num_tokens": 857202126.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "grad_norm": 1.49529230594635, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8860114216804504, + "num_tokens": 857242440.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "grad_norm": 1.7212531566619873, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.882405698299408, + "num_tokens": 857284595.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "grad_norm": 1.5639084577560425, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8922109007835388, + "num_tokens": 857320900.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "grad_norm": 1.5380078554153442, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.87529057264328, + "num_tokens": 857361781.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "grad_norm": 1.4862524271011353, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8719201683998108, + "num_tokens": 857405218.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "grad_norm": 1.569454550743103, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8848432898521423, + "num_tokens": 857443809.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "grad_norm": 1.7093801498413086, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8730063438415527, + "num_tokens": 857483100.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "grad_norm": 1.4764565229415894, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8899755477905273, + "num_tokens": 857524056.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "grad_norm": 1.4747401475906372, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 
0.8989913463592529, + "num_tokens": 857560337.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "grad_norm": 1.5368353128433228, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.8955351114273071, + "num_tokens": 857596118.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "grad_norm": 1.4677636623382568, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8952655792236328, + "num_tokens": 857635031.0, + "step": 22480 + }, + { + "epoch": 2.859814272993258, + "grad_norm": 1.6099026203155518, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8819403648376465, + "num_tokens": 857673507.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "grad_norm": 1.6647087335586548, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8782217502593994, + "num_tokens": 857711622.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "grad_norm": 1.5453717708587646, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8877772092819214, + "num_tokens": 857749204.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "grad_norm": 1.5474581718444824, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8802910447120667, + "num_tokens": 857790929.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "grad_norm": 1.608189582824707, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8957056999206543, + "num_tokens": 857822531.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "grad_norm": 1.5532633066177368, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8960102796554565, + "num_tokens": 857862034.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "grad_norm": 1.463032603263855, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8913085460662842, + "num_tokens": 857902439.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "grad_norm": 
1.802423119544983, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8837692737579346, + "num_tokens": 857935760.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "grad_norm": 1.5878429412841797, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9028193950653076, + "num_tokens": 857972891.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "grad_norm": 1.7060279846191406, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8662363290786743, + "num_tokens": 858008924.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "grad_norm": 1.5941338539123535, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8870682716369629, + "num_tokens": 858050225.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "grad_norm": 1.580492615699768, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8666674494743347, + "num_tokens": 858093609.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "grad_norm": 1.5013668537139893, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8957890272140503, + "num_tokens": 858134293.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "grad_norm": 1.5687869787216187, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8856372833251953, + "num_tokens": 858170650.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "grad_norm": 1.5598069429397583, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8857576251029968, + "num_tokens": 858213308.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "grad_norm": 1.6342490911483765, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8804386854171753, + "num_tokens": 858251741.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "grad_norm": 1.7158806324005127, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8789284229278564, + "num_tokens": 
858290040.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "grad_norm": 1.684401035308838, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.894763708114624, + "num_tokens": 858320659.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "grad_norm": 1.5696722269058228, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8891500234603882, + "num_tokens": 858358186.0, + "step": 22499 + }, + { + "epoch": 2.8622312682864774, + "grad_norm": 1.45488440990448, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8787854909896851, + "num_tokens": 858399456.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "grad_norm": 1.5986571311950684, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8744587898254395, + "num_tokens": 858441991.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "grad_norm": 1.5752909183502197, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8921157717704773, + "num_tokens": 858479822.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "grad_norm": 1.5254931449890137, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8760082721710205, + "num_tokens": 858521168.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "grad_norm": 1.489202618598938, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8844183683395386, + "num_tokens": 858561035.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "grad_norm": 1.5086640119552612, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8786208629608154, + "num_tokens": 858607252.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "grad_norm": 1.4773356914520264, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.8929906487464905, + "num_tokens": 858645401.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "grad_norm": 1.4950275421142578, + "learning_rate": 1e-06, 
+ "loss": 0.295, + "mean_token_accuracy": 0.8921995759010315, + "num_tokens": 858684935.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "grad_norm": 1.4673758745193481, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8886541128158569, + "num_tokens": 858726838.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "grad_norm": 1.5973197221755981, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8805860877037048, + "num_tokens": 858767260.0, + "step": 22509 + }, + { + "epoch": 2.8635033710723827, + "grad_norm": 1.4635870456695557, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8937855362892151, + "num_tokens": 858806572.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "grad_norm": 1.5090452432632446, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8757207989692688, + "num_tokens": 858847630.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "grad_norm": 1.4764105081558228, + "learning_rate": 1e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9051723480224609, + "num_tokens": 858883926.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "grad_norm": 1.5412873029708862, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.872223436832428, + "num_tokens": 858926924.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "grad_norm": 1.5751783847808838, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8866087198257446, + "num_tokens": 858962609.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "grad_norm": 1.6525194644927979, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8811759948730469, + "num_tokens": 858997520.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "grad_norm": 1.5558862686157227, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8940101861953735, + "num_tokens": 859033495.0, + "step": 22516 + }, + { + 
"epoch": 2.8643938430225164, + "grad_norm": 1.5856335163116455, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8922339677810669, + "num_tokens": 859068969.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "grad_norm": 1.6089777946472168, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.87736576795578, + "num_tokens": 859106443.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + "grad_norm": 1.4735556840896606, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8784626722335815, + "num_tokens": 859149813.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "grad_norm": 1.5438483953475952, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8853650093078613, + "num_tokens": 859192135.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "grad_norm": 1.587652325630188, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8900904059410095, + "num_tokens": 859230349.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "grad_norm": 1.5774357318878174, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8813589811325073, + "num_tokens": 859267694.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "grad_norm": 1.4907071590423584, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.895357608795166, + "num_tokens": 859306479.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "grad_norm": 1.4445455074310303, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.883258581161499, + "num_tokens": 859349569.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "grad_norm": 1.5084238052368164, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.8981359004974365, + "num_tokens": 859387300.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "grad_norm": 1.5752105712890625, + "learning_rate": 1e-06, + "loss": 0.3352, + 
"mean_token_accuracy": 0.8814253211021423, + "num_tokens": 859424882.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "grad_norm": 1.5159668922424316, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8919103145599365, + "num_tokens": 859462943.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "grad_norm": 1.5305140018463135, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.88538658618927, + "num_tokens": 859503944.0, + "step": 22528 + }, + { + "epoch": 2.8659203663656023, + "grad_norm": 1.4090700149536133, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8858435153961182, + "num_tokens": 859551270.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "grad_norm": 1.6536451578140259, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.885948896408081, + "num_tokens": 859584604.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "grad_norm": 1.5454990863800049, + "learning_rate": 1e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.8991556167602539, + "num_tokens": 859621161.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "grad_norm": 1.4983946084976196, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8894146680831909, + "num_tokens": 859661509.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "grad_norm": 1.6183185577392578, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8867281675338745, + "num_tokens": 859695490.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "grad_norm": 1.6630421876907349, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.879673182964325, + "num_tokens": 859729873.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "grad_norm": 1.6001555919647217, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8887999653816223, + "num_tokens": 859765769.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, 
+ "grad_norm": 1.556233525276184, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8853768110275269, + "num_tokens": 859799886.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "grad_norm": 1.4704463481903076, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8949698209762573, + "num_tokens": 859839060.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "grad_norm": 1.6029301881790161, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8727128505706787, + "num_tokens": 859880670.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "grad_norm": 1.5766286849975586, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8818023204803467, + "num_tokens": 859919584.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "grad_norm": 1.4938820600509644, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8860095739364624, + "num_tokens": 859962073.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "grad_norm": 1.4523688554763794, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8864954710006714, + "num_tokens": 860003495.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "grad_norm": 1.6461488008499146, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8643192052841187, + "num_tokens": 860042035.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "grad_norm": 1.4670977592468262, + "learning_rate": 1e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9032529592514038, + "num_tokens": 860081591.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "grad_norm": 1.6927696466445923, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8930245041847229, + "num_tokens": 860112795.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "grad_norm": 1.6747565269470215, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8795289993286133, 
+ "num_tokens": 860149031.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "grad_norm": 1.6492316722869873, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8829877376556396, + "num_tokens": 860186816.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "grad_norm": 1.676866888999939, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8808941841125488, + "num_tokens": 860227288.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "grad_norm": 1.6440227031707764, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8882291316986084, + "num_tokens": 860263179.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "grad_norm": 1.6833744049072266, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8896944522857666, + "num_tokens": 860296655.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "grad_norm": 1.7044882774353027, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8865444660186768, + "num_tokens": 860328387.0, + "step": 22550 + }, + { + "epoch": 2.8687189924945935, + "grad_norm": 1.5745280981063843, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8807644844055176, + "num_tokens": 860364558.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "grad_norm": 1.5205975770950317, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8666225075721741, + "num_tokens": 860409370.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "grad_norm": 1.6002856492996216, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.887299120426178, + "num_tokens": 860443988.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "grad_norm": 1.4344861507415771, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8966064453125, + "num_tokens": 860486646.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "grad_norm": 1.3019498586654663, + 
"learning_rate": 1e-06, + "loss": 0.2483, + "mean_token_accuracy": 0.9107519388198853, + "num_tokens": 860533239.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "grad_norm": 1.43865966796875, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8858471512794495, + "num_tokens": 860572325.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "grad_norm": 1.5687015056610107, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8820379376411438, + "num_tokens": 860613580.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "grad_norm": 1.684084177017212, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8887332081794739, + "num_tokens": 860647916.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "grad_norm": 1.6963729858398438, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8946406841278076, + "num_tokens": 860676836.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "grad_norm": 1.4717333316802979, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8871948719024658, + "num_tokens": 860717794.0, + "step": 22560 + }, + { + "epoch": 2.869991095280499, + "grad_norm": 1.6590782403945923, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8796607255935669, + "num_tokens": 860754498.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "grad_norm": 1.6326932907104492, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.893643319606781, + "num_tokens": 860788288.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "grad_norm": 1.5025629997253418, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8946228623390198, + "num_tokens": 860824340.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "grad_norm": 1.6114615201950073, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.8957763910293579, + "num_tokens": 860855562.0, + "step": 
22564 + }, + { + "epoch": 2.870499936394861, + "grad_norm": 1.4471124410629272, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8958242535591125, + "num_tokens": 860897693.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "grad_norm": 1.5936248302459717, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8940662145614624, + "num_tokens": 860933805.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "grad_norm": 1.4850696325302124, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8844332695007324, + "num_tokens": 860975752.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "grad_norm": 1.567978858947754, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8862445950508118, + "num_tokens": 861013038.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "grad_norm": 1.674092173576355, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8644567131996155, + "num_tokens": 861051590.0, + "step": 22569 + }, + { + "epoch": 2.871135987787813, + "grad_norm": 1.5261465311050415, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8936231732368469, + "num_tokens": 861090056.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "grad_norm": 1.5183149576187134, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8806909322738647, + "num_tokens": 861127686.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "grad_norm": 1.4293617010116577, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8950209617614746, + "num_tokens": 861169309.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "grad_norm": 1.5317739248275757, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8832215666770935, + "num_tokens": 861208782.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "grad_norm": 1.6882473230361938, + "learning_rate": 1e-06, + "loss": 0.3742, + 
"mean_token_accuracy": 0.8687582612037659, + "num_tokens": 861245777.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "grad_norm": 1.4044392108917236, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8870578408241272, + "num_tokens": 861290047.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "grad_norm": 1.6420772075653076, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8833078145980835, + "num_tokens": 861326101.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "grad_norm": 1.5468051433563232, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8867747783660889, + "num_tokens": 861361771.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "grad_norm": 1.468704104423523, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8827664852142334, + "num_tokens": 861404288.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "grad_norm": 1.5090166330337524, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8945209383964539, + "num_tokens": 861440272.0, + "step": 22579 + }, + { + "epoch": 2.8724080905737184, + "grad_norm": 1.4411506652832031, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8851836919784546, + "num_tokens": 861484130.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "grad_norm": 1.5146369934082031, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8930840492248535, + "num_tokens": 861520775.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "grad_norm": 1.6339811086654663, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8825135827064514, + "num_tokens": 861557064.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "grad_norm": 1.4153724908828735, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8889555931091309, + "num_tokens": 861601776.0, + "step": 22583 + }, + { + "epoch": 
2.8729169316880805, + "grad_norm": 1.540082335472107, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.8981532454490662, + "num_tokens": 861639022.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "grad_norm": 1.472154974937439, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8854966163635254, + "num_tokens": 861679056.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "grad_norm": 1.3311536312103271, + "learning_rate": 1e-06, + "loss": 0.2671, + "mean_token_accuracy": 0.9016821384429932, + "num_tokens": 861722113.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "grad_norm": 1.68084716796875, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8771284818649292, + "num_tokens": 861760909.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "grad_norm": 1.569227695465088, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.880595326423645, + "num_tokens": 861802262.0, + "step": 22588 + }, + { + "epoch": 2.8735529830810327, + "grad_norm": 1.510627031326294, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8893667459487915, + "num_tokens": 861839958.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "grad_norm": 1.574265718460083, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8869839310646057, + "num_tokens": 861880238.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "grad_norm": 1.6040849685668945, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8911351561546326, + "num_tokens": 861914965.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "grad_norm": 1.4235800504684448, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8820106387138367, + "num_tokens": 861963546.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "grad_norm": 1.6059455871582031, + "learning_rate": 1e-06, + "loss": 0.2726, + "mean_token_accuracy": 
0.901458740234375, + "num_tokens": 862001198.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "grad_norm": 1.618881344795227, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.880017876625061, + "num_tokens": 862037508.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "grad_norm": 1.46341073513031, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8785538077354431, + "num_tokens": 862082412.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "grad_norm": 1.6306724548339844, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8865646719932556, + "num_tokens": 862117141.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "grad_norm": 1.5346044301986694, + "learning_rate": 1e-06, + "loss": 0.2442, + "mean_token_accuracy": 0.9089397192001343, + "num_tokens": 862150059.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "grad_norm": 1.475441336631775, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8872557282447815, + "num_tokens": 862191935.0, + "step": 22598 + }, + { + "epoch": 2.874825085866938, + "grad_norm": 1.5892984867095947, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8788820505142212, + "num_tokens": 862228197.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "grad_norm": 1.5853323936462402, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8889328837394714, + "num_tokens": 862267319.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "grad_norm": 1.5079121589660645, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8822965025901794, + "num_tokens": 862306634.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "grad_norm": 1.5629445314407349, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.9000707864761353, + "num_tokens": 862340824.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "grad_norm": 
1.5327579975128174, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.892945408821106, + "num_tokens": 862379770.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "grad_norm": 1.4597046375274658, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8912928104400635, + "num_tokens": 862422624.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "grad_norm": 1.538450837135315, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8810561895370483, + "num_tokens": 862462780.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "grad_norm": 1.429837942123413, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8795169591903687, + "num_tokens": 862511310.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "grad_norm": 1.5723017454147339, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8951255083084106, + "num_tokens": 862550719.0, + "step": 22607 + }, + { + "epoch": 2.8759699783742527, + "grad_norm": 1.619683027267456, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8902917504310608, + "num_tokens": 862584476.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "grad_norm": 1.768389344215393, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8672992587089539, + "num_tokens": 862621476.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "grad_norm": 1.8484407663345337, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8688953518867493, + "num_tokens": 862654876.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "grad_norm": 1.5789518356323242, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8879362940788269, + "num_tokens": 862692712.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "grad_norm": 1.4104535579681396, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8895598649978638, + "num_tokens": 
862735740.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "grad_norm": 1.4436578750610352, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8843812942504883, + "num_tokens": 862779883.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "grad_norm": 1.3836004734039307, + "learning_rate": 1e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.8986806273460388, + "num_tokens": 862820370.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "grad_norm": 1.6609437465667725, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8773956894874573, + "num_tokens": 862858312.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "grad_norm": 1.6545288562774658, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8872868418693542, + "num_tokens": 862889674.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "grad_norm": 1.4868316650390625, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8810606002807617, + "num_tokens": 862931702.0, + "step": 22617 + }, + { + "epoch": 2.8772420811601576, + "grad_norm": 1.636056661605835, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8755545616149902, + "num_tokens": 862969969.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "grad_norm": 1.4854226112365723, + "learning_rate": 1e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.8959506154060364, + "num_tokens": 863010251.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "grad_norm": 1.509645938873291, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.8998950123786926, + "num_tokens": 863047721.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "grad_norm": 1.6345707178115845, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8847628235816956, + "num_tokens": 863082687.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "grad_norm": 1.471237063407898, + "learning_rate": 
1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8898257613182068, + "num_tokens": 863122460.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "grad_norm": 1.5739649534225464, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8973463773727417, + "num_tokens": 863156432.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "grad_norm": 1.6433318853378296, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8775136470794678, + "num_tokens": 863191438.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "grad_norm": 1.570947527885437, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.887965977191925, + "num_tokens": 863227815.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "grad_norm": 1.5160505771636963, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8875502347946167, + "num_tokens": 863266191.0, + "step": 22626 + }, + { + "epoch": 2.8783869736674723, + "grad_norm": 1.4278980493545532, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8895591497421265, + "num_tokens": 863308198.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "grad_norm": 1.5214475393295288, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8828549981117249, + "num_tokens": 863347574.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "grad_norm": 1.5431671142578125, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8846962451934814, + "num_tokens": 863385237.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "grad_norm": 1.4662972688674927, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8801345229148865, + "num_tokens": 863428860.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "grad_norm": 1.4846056699752808, + "learning_rate": 1e-06, + "loss": 0.2579, + "mean_token_accuracy": 0.9033591747283936, + "num_tokens": 863463943.0, + "step": 22631 + }, + { + 
"epoch": 2.879023025060425, + "grad_norm": 1.7599884271621704, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8708471059799194, + "num_tokens": 863498445.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "grad_norm": 1.3900902271270752, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8937819600105286, + "num_tokens": 863542495.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "grad_norm": 1.5322664976119995, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8975359797477722, + "num_tokens": 863580864.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "grad_norm": 1.6233693361282349, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8663268089294434, + "num_tokens": 863620897.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "grad_norm": 1.5282492637634277, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.8950524926185608, + "num_tokens": 863655770.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "grad_norm": 1.5924113988876343, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8852589726448059, + "num_tokens": 863697110.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "grad_norm": 1.4182039499282837, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8921189904212952, + "num_tokens": 863740131.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "grad_norm": 1.5144833326339722, + "learning_rate": 1e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.9014172554016113, + "num_tokens": 863777617.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "grad_norm": 1.5083507299423218, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.886066734790802, + "num_tokens": 863817521.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "grad_norm": 1.594632625579834, + "learning_rate": 1e-06, + "loss": 0.3352, + 
"mean_token_accuracy": 0.8837157487869263, + "num_tokens": 863852492.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "grad_norm": 1.6468061208724976, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8982149362564087, + "num_tokens": 863886789.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "grad_norm": 1.5109630823135376, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8691529035568237, + "num_tokens": 863928754.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "grad_norm": 1.4634320735931396, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8854568004608154, + "num_tokens": 863971312.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "grad_norm": 1.5068877935409546, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8893535137176514, + "num_tokens": 864012306.0, + "step": 22645 + }, + { + "epoch": 2.880803968960692, + "grad_norm": 1.489784598350525, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8868750333786011, + "num_tokens": 864053281.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "grad_norm": 1.462937593460083, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8962796330451965, + "num_tokens": 864092830.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "grad_norm": 1.5997501611709595, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8851141929626465, + "num_tokens": 864130186.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "grad_norm": 1.3783243894577026, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.891880452632904, + "num_tokens": 864175037.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "grad_norm": 1.6149158477783203, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8869409561157227, + "num_tokens": 864208540.0, + "step": 22650 + }, + { + "epoch": 
2.8814400203536445, + "grad_norm": 1.5630384683609009, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8606146574020386, + "num_tokens": 864252361.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "grad_norm": 1.5556102991104126, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8802402019500732, + "num_tokens": 864292316.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "grad_norm": 1.4114795923233032, + "learning_rate": 1e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.8996468186378479, + "num_tokens": 864331442.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "grad_norm": 1.4950768947601318, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8959318995475769, + "num_tokens": 864371374.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "grad_norm": 1.59845769405365, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8798747658729553, + "num_tokens": 864406760.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "grad_norm": 1.6573947668075562, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8922327160835266, + "num_tokens": 864439398.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "grad_norm": 1.5554858446121216, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.877058207988739, + "num_tokens": 864475989.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "grad_norm": 1.4310221672058105, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8835623264312744, + "num_tokens": 864520341.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "grad_norm": 1.6430848836898804, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8753122687339783, + "num_tokens": 864559940.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "grad_norm": 1.4804174900054932, + "learning_rate": 1e-06, + "loss": 0.2646, + "mean_token_accuracy": 
0.9042980074882507, + "num_tokens": 864597587.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "grad_norm": 1.5869115591049194, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8749507665634155, + "num_tokens": 864638717.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "grad_norm": 1.602466106414795, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8817715644836426, + "num_tokens": 864676680.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "grad_norm": 1.5330718755722046, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8839124441146851, + "num_tokens": 864714501.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "grad_norm": 1.54537832736969, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8750572800636292, + "num_tokens": 864754393.0, + "step": 22664 + }, + { + "epoch": 2.883220964253912, + "grad_norm": 1.4548654556274414, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8903394937515259, + "num_tokens": 864797379.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "grad_norm": 1.4459633827209473, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8941095471382141, + "num_tokens": 864839330.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "grad_norm": 1.5832473039627075, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8789247274398804, + "num_tokens": 864877668.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "grad_norm": 1.5166966915130615, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8963263034820557, + "num_tokens": 864916578.0, + "step": 22668 + }, + { + "epoch": 2.8837298053682736, + "grad_norm": 1.499548077583313, + "learning_rate": 1e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.907160758972168, + "num_tokens": 864954151.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "grad_norm": 
1.4107997417449951, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8929709792137146, + "num_tokens": 864994931.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "grad_norm": 1.579638123512268, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8984898924827576, + "num_tokens": 865030073.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "grad_norm": 1.61739182472229, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8812845945358276, + "num_tokens": 865064915.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "grad_norm": 1.605043649673462, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9036877751350403, + "num_tokens": 865100102.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "grad_norm": 1.5567197799682617, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8916593790054321, + "num_tokens": 865138333.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "grad_norm": 1.6181529760360718, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8955279588699341, + "num_tokens": 865173259.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "grad_norm": 1.5402419567108154, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8880693912506104, + "num_tokens": 865209344.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "grad_norm": 1.3449654579162598, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8960422277450562, + "num_tokens": 865255333.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "grad_norm": 1.3921128511428833, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8885440230369568, + "num_tokens": 865297521.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "grad_norm": 1.6361265182495117, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8777463436126709, + "num_tokens": 
865333046.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "grad_norm": 1.4344830513000488, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8867088556289673, + "num_tokens": 865378468.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "grad_norm": 1.605617880821228, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8908616900444031, + "num_tokens": 865411795.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "grad_norm": 1.6536518335342407, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8820312023162842, + "num_tokens": 865446734.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "grad_norm": 1.5707595348358154, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8693773150444031, + "num_tokens": 865489497.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + "grad_norm": 1.4815605878829956, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8965498208999634, + "num_tokens": 865528336.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "grad_norm": 1.682440996170044, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8954294919967651, + "num_tokens": 865564008.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "grad_norm": 1.6321513652801514, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8651396632194519, + "num_tokens": 865599728.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "grad_norm": 1.5759743452072144, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8899145126342773, + "num_tokens": 865634147.0, + "step": 22687 + }, + { + "epoch": 2.8861468006614936, + "grad_norm": 1.5262537002563477, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.875565230846405, + "num_tokens": 865675979.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "grad_norm": 1.5184651613235474, + "learning_rate": 
1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8932090401649475, + "num_tokens": 865713618.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "grad_norm": 1.6995426416397095, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8988606929779053, + "num_tokens": 865741490.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "grad_norm": 1.5649501085281372, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8931522369384766, + "num_tokens": 865775750.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "grad_norm": 1.5070488452911377, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8820929527282715, + "num_tokens": 865817156.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "grad_norm": 1.412948727607727, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8957681059837341, + "num_tokens": 865858024.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "grad_norm": 1.4651689529418945, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8835501670837402, + "num_tokens": 865897344.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "grad_norm": 1.5635583400726318, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8909274339675903, + "num_tokens": 865934367.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "grad_norm": 1.538300633430481, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8852372169494629, + "num_tokens": 865976091.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "grad_norm": 1.5927612781524658, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8746048212051392, + "num_tokens": 866018313.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "grad_norm": 1.4618945121765137, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8746946454048157, + "num_tokens": 866065077.0, + "step": 22698 + }, + { + 
"epoch": 2.887546113725989, + "grad_norm": 1.4890916347503662, + "learning_rate": 1e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.9058981537818909, + "num_tokens": 866098350.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "grad_norm": 1.5538440942764282, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8835065960884094, + "num_tokens": 866137416.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "grad_norm": 1.5533571243286133, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8906427621841431, + "num_tokens": 866177311.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "grad_norm": 1.6611541509628296, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8629084229469299, + "num_tokens": 866216466.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "grad_norm": 1.649827241897583, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8744333386421204, + "num_tokens": 866253857.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "grad_norm": 1.5291575193405151, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8941243886947632, + "num_tokens": 866289585.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "grad_norm": 1.521801233291626, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8846205472946167, + "num_tokens": 866327774.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "grad_norm": 1.482191801071167, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8832775354385376, + "num_tokens": 866371879.0, + "step": 22706 + }, + { + "epoch": 2.8885637959547132, + "grad_norm": 1.547184944152832, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8668022751808167, + "num_tokens": 866414732.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "grad_norm": 1.53705894947052, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 
0.8809012174606323, + "num_tokens": 866454173.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "grad_norm": 1.5629576444625854, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8896905183792114, + "num_tokens": 866492075.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "grad_norm": 1.596286416053772, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8813327550888062, + "num_tokens": 866529847.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "grad_norm": 1.500749945640564, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8886023759841919, + "num_tokens": 866567790.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "grad_norm": 1.6158447265625, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8735620379447937, + "num_tokens": 866604802.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "grad_norm": 1.4765872955322266, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8743418455123901, + "num_tokens": 866652124.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "grad_norm": 1.6226054430007935, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8884608745574951, + "num_tokens": 866687036.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "grad_norm": 1.6562799215316772, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.899027943611145, + "num_tokens": 866718533.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "grad_norm": 1.6835153102874756, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8845551013946533, + "num_tokens": 866750166.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "grad_norm": 1.6313319206237793, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8813040852546692, + "num_tokens": 866782589.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "grad_norm": 
1.614822506904602, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.871720016002655, + "num_tokens": 866823113.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "grad_norm": 1.4870811700820923, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8833346366882324, + "num_tokens": 866865704.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "grad_norm": 1.5965791940689087, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8836501836776733, + "num_tokens": 866901259.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "grad_norm": 1.4393287897109985, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.8981068730354309, + "num_tokens": 866941032.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "grad_norm": 1.5731099843978882, + "learning_rate": 1e-06, + "loss": 0.2603, + "mean_token_accuracy": 0.9050721526145935, + "num_tokens": 866977266.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "grad_norm": 1.6412889957427979, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8939648866653442, + "num_tokens": 867011991.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "grad_norm": 1.4969974756240845, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8853311538696289, + "num_tokens": 867052108.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "grad_norm": 1.6147756576538086, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8858491778373718, + "num_tokens": 867088523.0, + "step": 22725 + }, + { + "epoch": 2.890980791247933, + "grad_norm": 1.5923823118209839, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8836855888366699, + "num_tokens": 867126446.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "grad_norm": 1.7350351810455322, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8632948398590088, + "num_tokens": 
867159776.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "grad_norm": 1.4330605268478394, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.887903094291687, + "num_tokens": 867199887.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "grad_norm": 1.4675687551498413, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8908634781837463, + "num_tokens": 867242101.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "grad_norm": 1.4073466062545776, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8957982063293457, + "num_tokens": 867282099.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "grad_norm": 1.550321340560913, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8923304677009583, + "num_tokens": 867316551.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "grad_norm": 1.4941800832748413, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8891043663024902, + "num_tokens": 867354225.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "grad_norm": 1.64254891872406, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8876236081123352, + "num_tokens": 867394762.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "grad_norm": 1.605636715888977, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8817763924598694, + "num_tokens": 867429157.0, + "step": 22734 + }, + { + "epoch": 2.8921256837552476, + "grad_norm": 1.4911271333694458, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8984336853027344, + "num_tokens": 867464697.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "grad_norm": 1.4514198303222656, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8766642808914185, + "num_tokens": 867508888.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "grad_norm": 1.513150691986084, + "learning_rate": 1e-06, 
+ "loss": 0.2757, + "mean_token_accuracy": 0.8992652297019958, + "num_tokens": 867544679.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "grad_norm": 1.6997909545898438, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8806928396224976, + "num_tokens": 867578410.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "grad_norm": 1.5352795124053955, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8894661664962769, + "num_tokens": 867616182.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "grad_norm": 1.4688084125518799, + "learning_rate": 1e-06, + "loss": 0.2657, + "mean_token_accuracy": 0.9040411710739136, + "num_tokens": 867653963.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "grad_norm": 1.4601894617080688, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.9007854461669922, + "num_tokens": 867691746.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "grad_norm": 1.5559799671173096, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8966298699378967, + "num_tokens": 867728734.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "grad_norm": 1.5399489402770996, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8858484029769897, + "num_tokens": 867765424.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "grad_norm": 1.7445213794708252, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8865138292312622, + "num_tokens": 867796779.0, + "step": 22744 + }, + { + "epoch": 2.8933977865411524, + "grad_norm": 1.5694365501403809, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8869719505310059, + "num_tokens": 867837859.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "grad_norm": 1.6806492805480957, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8615210056304932, + "num_tokens": 867877653.0, + "step": 22746 + }, + { + 
"epoch": 2.8936522070983335, + "grad_norm": 1.3923372030258179, + "learning_rate": 1e-06, + "loss": 0.2718, + "mean_token_accuracy": 0.8995876312255859, + "num_tokens": 867920580.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "grad_norm": 1.4817496538162231, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8786394596099854, + "num_tokens": 867961553.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "grad_norm": 1.4776349067687988, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8915214538574219, + "num_tokens": 868001573.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "grad_norm": 1.7184977531433105, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8769818544387817, + "num_tokens": 868038283.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "grad_norm": 1.484906792640686, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8806917667388916, + "num_tokens": 868080094.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "grad_norm": 1.757849097251892, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8840853571891785, + "num_tokens": 868115577.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "grad_norm": 1.5290148258209229, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8779894113540649, + "num_tokens": 868154058.0, + "step": 22753 + }, + { + "epoch": 2.894542679048467, + "grad_norm": 1.6023602485656738, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8855240345001221, + "num_tokens": 868192263.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "grad_norm": 1.6012749671936035, + "learning_rate": 1e-06, + "loss": 0.278, + "mean_token_accuracy": 0.8993518352508545, + "num_tokens": 868227094.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "grad_norm": 1.532606601715088, + "learning_rate": 1e-06, + "loss": 0.307, + 
"mean_token_accuracy": 0.8906958103179932, + "num_tokens": 868264567.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "grad_norm": 1.6326285600662231, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8923981785774231, + "num_tokens": 868297049.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "grad_norm": 1.5838868618011475, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8796564340591431, + "num_tokens": 868336114.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "grad_norm": 1.648098111152649, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8795076608657837, + "num_tokens": 868369468.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "grad_norm": 1.6188030242919922, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8819655776023865, + "num_tokens": 868406284.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "grad_norm": 1.4732171297073364, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8869704008102417, + "num_tokens": 868444709.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "grad_norm": 1.5592492818832397, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8808850049972534, + "num_tokens": 868483662.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "grad_norm": 1.4314155578613281, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8774824142456055, + "num_tokens": 868528937.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + "grad_norm": 1.5536649227142334, + "learning_rate": 1e-06, + "loss": 0.2718, + "mean_token_accuracy": 0.8998051881790161, + "num_tokens": 868563774.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "grad_norm": 1.5648640394210815, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.884810209274292, + "num_tokens": 868602967.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + 
"grad_norm": 1.449690341949463, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8909982442855835, + "num_tokens": 868643043.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "grad_norm": 1.4805829524993896, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8923729658126831, + "num_tokens": 868681397.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "grad_norm": 1.5216971635818481, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8836444616317749, + "num_tokens": 868721691.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "grad_norm": 1.5795120000839233, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8839704990386963, + "num_tokens": 868764570.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "grad_norm": 1.5259172916412354, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8934292793273926, + "num_tokens": 868801489.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "grad_norm": 1.5361740589141846, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.880562424659729, + "num_tokens": 868841557.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "grad_norm": 1.5496022701263428, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8805775046348572, + "num_tokens": 868879870.0, + "step": 22772 + }, + { + "epoch": 2.8969596743416868, + "grad_norm": 1.5555413961410522, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8900534510612488, + "num_tokens": 868914710.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "grad_norm": 1.8222072124481201, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8959083557128906, + "num_tokens": 868949817.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "grad_norm": 1.3557714223861694, + "learning_rate": 1e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.907023012638092, + 
"num_tokens": 868988082.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "grad_norm": 1.5964549779891968, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.882525622844696, + "num_tokens": 869022901.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "grad_norm": 1.592819333076477, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8797347545623779, + "num_tokens": 869057061.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "grad_norm": 1.521721601486206, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8950222134590149, + "num_tokens": 869094424.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "grad_norm": 1.3695621490478516, + "learning_rate": 1e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.8967021703720093, + "num_tokens": 869134148.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "grad_norm": 1.5019289255142212, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8815289735794067, + "num_tokens": 869174640.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "grad_norm": 1.466017723083496, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8838447332382202, + "num_tokens": 869215725.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "grad_norm": 1.5562849044799805, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8801054358482361, + "num_tokens": 869252965.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "grad_norm": 1.6194705963134766, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8876825571060181, + "num_tokens": 869286671.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "grad_norm": 1.517466425895691, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8832076787948608, + "num_tokens": 869325661.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "grad_norm": 1.6342856884002686, + 
"learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.876589298248291, + "num_tokens": 869362108.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "grad_norm": 1.4727729558944702, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8919867277145386, + "num_tokens": 869400656.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "grad_norm": 1.6499441862106323, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8758235573768616, + "num_tokens": 869436773.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "grad_norm": 1.5491485595703125, + "learning_rate": 1e-06, + "loss": 0.2547, + "mean_token_accuracy": 0.9063795804977417, + "num_tokens": 869469208.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "grad_norm": 1.480845332145691, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8951738476753235, + "num_tokens": 869507343.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "grad_norm": 1.39302659034729, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9003217220306396, + "num_tokens": 869549310.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "grad_norm": 1.4969606399536133, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8842310309410095, + "num_tokens": 869589584.0, + "step": 22791 + }, + { + "epoch": 2.8993766696349064, + "grad_norm": 1.5286940336227417, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8792479634284973, + "num_tokens": 869629962.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "grad_norm": 1.5191926956176758, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.896665632724762, + "num_tokens": 869671919.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "grad_norm": 1.4830384254455566, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.882998526096344, + "num_tokens": 869714998.0, + "step": 
22794 + }, + { + "epoch": 2.899758300470678, + "grad_norm": 1.537514567375183, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8933607339859009, + "num_tokens": 869752246.0, + "step": 22795 + }, + { + "epoch": 2.8998855107492685, + "grad_norm": 1.7168601751327515, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8572397232055664, + "num_tokens": 869788982.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "grad_norm": 1.4760082960128784, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8965053558349609, + "num_tokens": 869825918.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "grad_norm": 1.5628358125686646, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8841915130615234, + "num_tokens": 869864557.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "grad_norm": 1.507863998413086, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8732627630233765, + "num_tokens": 869904908.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "grad_norm": 1.5949151515960693, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8841989040374756, + "num_tokens": 869940652.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "grad_norm": 1.5783162117004395, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8929505348205566, + "num_tokens": 869980910.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "grad_norm": 1.5421268939971924, + "learning_rate": 1e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.8966504335403442, + "num_tokens": 870017703.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "grad_norm": 1.5899204015731812, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8721683621406555, + "num_tokens": 870056813.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "grad_norm": 1.5871381759643555, + "learning_rate": 1e-06, + "loss": 0.347, + 
"mean_token_accuracy": 0.8771569728851318, + "num_tokens": 870093601.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "grad_norm": 1.5712227821350098, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8845707178115845, + "num_tokens": 870134913.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "grad_norm": 1.4415977001190186, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8797589540481567, + "num_tokens": 870180876.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "grad_norm": 1.4225636720657349, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8853360414505005, + "num_tokens": 870222445.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "grad_norm": 1.4531259536743164, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8890612721443176, + "num_tokens": 870262647.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "grad_norm": 1.5831838846206665, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8777779340744019, + "num_tokens": 870298679.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "grad_norm": 1.5814262628555298, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8854126930236816, + "num_tokens": 870336276.0, + "step": 22810 + }, + { + "epoch": 2.9017936649281264, + "grad_norm": 1.498200535774231, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8953235745429993, + "num_tokens": 870373653.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "grad_norm": 1.6198017597198486, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8766932487487793, + "num_tokens": 870412015.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "grad_norm": 1.7144157886505127, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8682278394699097, + "num_tokens": 870445613.0, + "step": 22813 + }, + { + "epoch": 
2.9021752957638975, + "grad_norm": 1.5404558181762695, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8867871761322021, + "num_tokens": 870486782.0, + "step": 22814 + }, + { + "epoch": 2.902302506042488, + "grad_norm": 1.7872518301010132, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8823710083961487, + "num_tokens": 870522411.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "grad_norm": 1.5453788042068481, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8813095688819885, + "num_tokens": 870563164.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "grad_norm": 1.546715259552002, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8881017565727234, + "num_tokens": 870599585.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "grad_norm": 1.7438799142837524, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8911991119384766, + "num_tokens": 870634162.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "grad_norm": 1.8174537420272827, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8749008178710938, + "num_tokens": 870662668.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "grad_norm": 1.6189976930618286, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8827703595161438, + "num_tokens": 870696940.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "grad_norm": 1.7005435228347778, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8683645725250244, + "num_tokens": 870732097.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "grad_norm": 1.5916610956192017, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.870313823223114, + "num_tokens": 870775566.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "grad_norm": 1.547014832496643, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 
0.8915157914161682, + "num_tokens": 870812225.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "grad_norm": 1.6356946229934692, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8853905200958252, + "num_tokens": 870845704.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "grad_norm": 1.603662133216858, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8810150623321533, + "num_tokens": 870885449.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "grad_norm": 1.6328372955322266, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8692783117294312, + "num_tokens": 870926088.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "grad_norm": 1.5710691213607788, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8785046339035034, + "num_tokens": 870966367.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "grad_norm": 1.677872896194458, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8844897150993347, + "num_tokens": 870997498.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "grad_norm": 1.5439649820327759, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8892302513122559, + "num_tokens": 871035186.0, + "step": 22829 + }, + { + "epoch": 2.904210660221346, + "grad_norm": 1.5218631029129028, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.8972524404525757, + "num_tokens": 871069859.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "grad_norm": 1.4779655933380127, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8847404718399048, + "num_tokens": 871113399.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "grad_norm": 1.4803968667984009, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9005324840545654, + "num_tokens": 871150754.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "grad_norm": 
1.6378554105758667, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.878950834274292, + "num_tokens": 871185653.0, + "step": 22833 + }, + { + "epoch": 2.904719501335708, + "grad_norm": 1.4970049858093262, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8832303285598755, + "num_tokens": 871225336.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "grad_norm": 1.6359515190124512, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8812578916549683, + "num_tokens": 871261408.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "grad_norm": 1.5363670587539673, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8721767067909241, + "num_tokens": 871307124.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "grad_norm": 1.5432780981063843, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.886127233505249, + "num_tokens": 871343953.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "grad_norm": 1.449455738067627, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8870357871055603, + "num_tokens": 871386793.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "grad_norm": 1.6831655502319336, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8842889666557312, + "num_tokens": 871421362.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "grad_norm": 1.5020027160644531, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8814277648925781, + "num_tokens": 871464180.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "grad_norm": 1.652539610862732, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8817169666290283, + "num_tokens": 871499606.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "grad_norm": 1.608946681022644, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8688907623291016, + "num_tokens": 
871536674.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "grad_norm": 1.5279147624969482, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8955087065696716, + "num_tokens": 871577489.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "grad_norm": 1.6517659425735474, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8871921896934509, + "num_tokens": 871612661.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "grad_norm": 1.5899301767349243, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8899182081222534, + "num_tokens": 871646525.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "grad_norm": 1.5608150959014893, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8700771331787109, + "num_tokens": 871686402.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "grad_norm": 1.5135626792907715, + "learning_rate": 1e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.8953790068626404, + "num_tokens": 871721900.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "grad_norm": 1.4065191745758057, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8909716606140137, + "num_tokens": 871766058.0, + "step": 22848 + }, + { + "epoch": 2.9066276555145656, + "grad_norm": 1.589829683303833, + "learning_rate": 1e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.8941879272460938, + "num_tokens": 871801191.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "grad_norm": 1.6271382570266724, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8894960284233093, + "num_tokens": 871835043.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "grad_norm": 1.6527856588363647, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8751618266105652, + "num_tokens": 871871584.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "grad_norm": 1.674534797668457, + "learning_rate": 
1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8971052169799805, + "num_tokens": 871904522.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "grad_norm": 1.49947988986969, + "learning_rate": 1e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9002061486244202, + "num_tokens": 871945872.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "grad_norm": 1.643165111541748, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.88631671667099, + "num_tokens": 871985160.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "grad_norm": 1.6320972442626953, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8620977997779846, + "num_tokens": 872029104.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "grad_norm": 1.4481431245803833, + "learning_rate": 1e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.8993493318557739, + "num_tokens": 872071126.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "grad_norm": 1.570313811302185, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8810606002807617, + "num_tokens": 872110524.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "grad_norm": 1.457698106765747, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8925269842147827, + "num_tokens": 872153065.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "grad_norm": 1.6454334259033203, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8881813883781433, + "num_tokens": 872192498.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "grad_norm": 1.5932331085205078, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8803727626800537, + "num_tokens": 872230760.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "grad_norm": 1.676037311553955, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.878748893737793, + "num_tokens": 872268308.0, + "step": 22861 + }, + { + "epoch": 
2.908281389136242, + "grad_norm": 1.5515375137329102, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8747401833534241, + "num_tokens": 872304652.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "grad_norm": 1.4522292613983154, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8925042152404785, + "num_tokens": 872348966.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "grad_norm": 1.4971117973327637, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8850783109664917, + "num_tokens": 872390034.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "grad_norm": 1.4899513721466064, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8874579668045044, + "num_tokens": 872425856.0, + "step": 22865 + }, + { + "epoch": 2.908790230250604, + "grad_norm": 1.54097318649292, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8838869333267212, + "num_tokens": 872464185.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "grad_norm": 1.4538252353668213, + "learning_rate": 1e-06, + "loss": 0.2623, + "mean_token_accuracy": 0.9030473828315735, + "num_tokens": 872501418.0, + "step": 22867 + }, + { + "epoch": 2.909044650807785, + "grad_norm": 1.5261385440826416, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8794609904289246, + "num_tokens": 872542015.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "grad_norm": 1.6004520654678345, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.867522656917572, + "num_tokens": 872580884.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "grad_norm": 1.5256718397140503, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8827733397483826, + "num_tokens": 872616253.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "grad_norm": 1.4914168119430542, + "learning_rate": 1e-06, + "loss": 0.2634, + "mean_token_accuracy": 
0.9045613408088684, + "num_tokens": 872650871.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "grad_norm": 1.5357401371002197, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8882066607475281, + "num_tokens": 872688108.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "grad_norm": 1.7674747705459595, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8869308233261108, + "num_tokens": 872719532.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "grad_norm": 1.6301132440567017, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8843528628349304, + "num_tokens": 872759823.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "grad_norm": 1.6455920934677124, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8779109716415405, + "num_tokens": 872796195.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "grad_norm": 1.522384762763977, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.89974045753479, + "num_tokens": 872836754.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "grad_norm": 1.580660343170166, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.886563777923584, + "num_tokens": 872875265.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "grad_norm": 1.7938743829727173, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8669860363006592, + "num_tokens": 872910063.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "grad_norm": 1.4939450025558472, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8880900144577026, + "num_tokens": 872951761.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "grad_norm": 1.5912352800369263, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8984001278877258, + "num_tokens": 872987661.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "grad_norm": 
1.5596576929092407, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8905450105667114, + "num_tokens": 873024596.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "grad_norm": 1.5400680303573608, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8901435732841492, + "num_tokens": 873061727.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "grad_norm": 1.7411309480667114, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8868327736854553, + "num_tokens": 873094418.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "grad_norm": 1.5955835580825806, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8540204763412476, + "num_tokens": 873137628.0, + "step": 22884 + }, + { + "epoch": 2.9112072255438237, + "grad_norm": 1.617864727973938, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8924735188484192, + "num_tokens": 873170316.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "grad_norm": 1.5059714317321777, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8922974467277527, + "num_tokens": 873212026.0, + "step": 22886 + }, + { + "epoch": 2.9114616461010048, + "grad_norm": 1.4796719551086426, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8954654335975647, + "num_tokens": 873253410.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "grad_norm": 1.6279305219650269, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8856793642044067, + "num_tokens": 873288949.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "grad_norm": 1.5625649690628052, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8859856128692627, + "num_tokens": 873328247.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "grad_norm": 1.5502361059188843, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8766429424285889, + 
"num_tokens": 873373761.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "grad_norm": 1.4556764364242554, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8967330455780029, + "num_tokens": 873413839.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "grad_norm": 1.595741629600525, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8857229948043823, + "num_tokens": 873448686.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "grad_norm": 1.710868239402771, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8860434889793396, + "num_tokens": 873479668.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "grad_norm": 1.5264461040496826, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8862090110778809, + "num_tokens": 873516546.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "grad_norm": 1.566123604774475, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8936905860900879, + "num_tokens": 873550196.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "grad_norm": 1.539367437362671, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8712918162345886, + "num_tokens": 873594203.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "grad_norm": 1.5304694175720215, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.889157235622406, + "num_tokens": 873630933.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "grad_norm": 1.375709056854248, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8831196427345276, + "num_tokens": 873678652.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "grad_norm": 1.449316143989563, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9006537199020386, + "num_tokens": 873715447.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "grad_norm": 1.7732073068618774, + 
"learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8512382507324219, + "num_tokens": 873752934.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "grad_norm": 1.5274494886398315, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8815404772758484, + "num_tokens": 873792715.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "grad_norm": 1.6238192319869995, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8879494667053223, + "num_tokens": 873826812.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "grad_norm": 1.5607130527496338, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8900117874145508, + "num_tokens": 873864487.0, + "step": 22903 + }, + { + "epoch": 2.9136242208370438, + "grad_norm": 1.48853600025177, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8776895999908447, + "num_tokens": 873904858.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "grad_norm": 1.6368972063064575, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8702709674835205, + "num_tokens": 873941342.0, + "step": 22905 + }, + { + "epoch": 2.913878641394225, + "grad_norm": 1.5267822742462158, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8914347887039185, + "num_tokens": 873979901.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "grad_norm": 1.57034170627594, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8710657358169556, + "num_tokens": 874022504.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "grad_norm": 1.6196340322494507, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.885266900062561, + "num_tokens": 874057004.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "grad_norm": 1.5556225776672363, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8936982750892639, + "num_tokens": 874088896.0, + "step": 
22909 + }, + { + "epoch": 2.9143874825085865, + "grad_norm": 1.4712588787078857, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.890279233455658, + "num_tokens": 874130296.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "grad_norm": 1.5268290042877197, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8846806287765503, + "num_tokens": 874171085.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "grad_norm": 1.470285415649414, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.884200930595398, + "num_tokens": 874215834.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "grad_norm": 1.58201003074646, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8712671995162964, + "num_tokens": 874256731.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "grad_norm": 1.5325356721878052, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.879510760307312, + "num_tokens": 874296908.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "grad_norm": 1.5941282510757446, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8863104581832886, + "num_tokens": 874334918.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "grad_norm": 1.6381312608718872, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.8950471878051758, + "num_tokens": 874365592.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "grad_norm": 1.4388577938079834, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8938809633255005, + "num_tokens": 874407187.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "grad_norm": 1.5397565364837646, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.8954242467880249, + "num_tokens": 874446510.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "grad_norm": 1.5941355228424072, + "learning_rate": 1e-06, + "loss": 0.2853, + 
"mean_token_accuracy": 0.8950375914573669, + "num_tokens": 874481702.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "grad_norm": 1.4341460466384888, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8878823518753052, + "num_tokens": 874526015.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "grad_norm": 1.962764024734497, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8918226957321167, + "num_tokens": 874553188.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "grad_norm": 1.6290020942687988, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.894690990447998, + "num_tokens": 874585895.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, + "grad_norm": 1.5033140182495117, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8754546642303467, + "num_tokens": 874627203.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "grad_norm": 1.7620869874954224, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8789214491844177, + "num_tokens": 874658955.0, + "step": 22924 + }, + { + "epoch": 2.9162956366874444, + "grad_norm": 1.6606684923171997, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8833647966384888, + "num_tokens": 874692696.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "grad_norm": 1.5784010887145996, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8784934282302856, + "num_tokens": 874732827.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "grad_norm": 1.5144047737121582, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8989837765693665, + "num_tokens": 874770094.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "grad_norm": 1.5132856369018555, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.883610725402832, + "num_tokens": 874810986.0, + "step": 22928 + }, + { + "epoch": 
2.9168044778018065, + "grad_norm": 1.455589771270752, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8938488960266113, + "num_tokens": 874848541.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "grad_norm": 1.4648011922836304, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8974272608757019, + "num_tokens": 874886851.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "grad_norm": 1.5151491165161133, + "learning_rate": 1e-06, + "loss": 0.2354, + "mean_token_accuracy": 0.9134923219680786, + "num_tokens": 874918647.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "grad_norm": 1.517874002456665, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8834439516067505, + "num_tokens": 874958899.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "grad_norm": 1.3726552724838257, + "learning_rate": 1e-06, + "loss": 0.2584, + "mean_token_accuracy": 0.9047918319702148, + "num_tokens": 874999727.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "grad_norm": 1.4983725547790527, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8909854292869568, + "num_tokens": 875038108.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "grad_norm": 1.5222843885421753, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8954472541809082, + "num_tokens": 875076491.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "grad_norm": 1.506434679031372, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8820581436157227, + "num_tokens": 875116690.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "grad_norm": 1.4995348453521729, + "learning_rate": 1e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.8969519734382629, + "num_tokens": 875153694.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "grad_norm": 1.554254174232483, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 
0.888788104057312, + "num_tokens": 875192446.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "grad_norm": 1.6267744302749634, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8738877177238464, + "num_tokens": 875230496.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "grad_norm": 3.5911617279052734, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8777165412902832, + "num_tokens": 875266544.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "grad_norm": 1.5676517486572266, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8911291360855103, + "num_tokens": 875305129.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "grad_norm": 1.606845498085022, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8771094083786011, + "num_tokens": 875343861.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "grad_norm": 1.6002098321914673, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.886752724647522, + "num_tokens": 875384694.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "grad_norm": 1.4923834800720215, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8870428800582886, + "num_tokens": 875426520.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "grad_norm": 1.562732219696045, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.892731785774231, + "num_tokens": 875464791.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "grad_norm": 1.4621210098266602, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8984930515289307, + "num_tokens": 875504646.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "grad_norm": 1.5085430145263672, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8921372294425964, + "num_tokens": 875543878.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "grad_norm": 
1.5482892990112305, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8917516469955444, + "num_tokens": 875580475.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "grad_norm": 1.5642619132995605, + "learning_rate": 1e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.897878885269165, + "num_tokens": 875615779.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "grad_norm": 1.5140544176101685, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8806302547454834, + "num_tokens": 875657261.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "grad_norm": 1.4162520170211792, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8940268754959106, + "num_tokens": 875699667.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "grad_norm": 1.4943358898162842, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8905192017555237, + "num_tokens": 875738438.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "grad_norm": 1.6283583641052246, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8672864437103271, + "num_tokens": 875779326.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "grad_norm": 1.5763648748397827, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8899065256118774, + "num_tokens": 875816352.0, + "step": 22954 + }, + { + "epoch": 2.92011194504516, + "grad_norm": 1.5749796628952026, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8982962369918823, + "num_tokens": 875852401.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "grad_norm": 1.5365042686462402, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8862900733947754, + "num_tokens": 875890620.0, + "step": 22956 + }, + { + "epoch": 2.920366365602341, + "grad_norm": 1.5195903778076172, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8900750875473022, + "num_tokens": 
875928861.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "grad_norm": 1.52236008644104, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8946692943572998, + "num_tokens": 875967983.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "grad_norm": 1.5227437019348145, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8782572746276855, + "num_tokens": 876006692.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "grad_norm": 1.526224136352539, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8857842683792114, + "num_tokens": 876047884.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "grad_norm": 1.4388084411621094, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8788122534751892, + "num_tokens": 876093472.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "grad_norm": 1.7405433654785156, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8787670731544495, + "num_tokens": 876125755.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "grad_norm": 1.617980718612671, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8787758946418762, + "num_tokens": 876162806.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "grad_norm": 1.5499008893966675, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8908286094665527, + "num_tokens": 876203729.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "grad_norm": 1.5768617391586304, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8876560926437378, + "num_tokens": 876238569.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "grad_norm": 1.5671416521072388, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8863406181335449, + "num_tokens": 876275961.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "grad_norm": 1.5577820539474487, + "learning_rate": 1e-06, 
+ "loss": 0.3021, + "mean_token_accuracy": 0.8906714916229248, + "num_tokens": 876315141.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "grad_norm": 1.579676628112793, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.888830304145813, + "num_tokens": 876353319.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "grad_norm": 1.598140835762024, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8722217082977295, + "num_tokens": 876389549.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "grad_norm": 1.499851107597351, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8896613121032715, + "num_tokens": 876430606.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "grad_norm": 1.5197542905807495, + "learning_rate": 1e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9067904949188232, + "num_tokens": 876463968.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "grad_norm": 1.527499794960022, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8949766159057617, + "num_tokens": 876496375.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "grad_norm": 1.457306146621704, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8987561464309692, + "num_tokens": 876532921.0, + "step": 22973 + }, + { + "epoch": 2.9225289403383794, + "grad_norm": 1.5068771839141846, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8792984485626221, + "num_tokens": 876573468.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "grad_norm": 1.6073075532913208, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8862094879150391, + "num_tokens": 876610734.0, + "step": 22975 + }, + { + "epoch": 2.9227833608955605, + "grad_norm": 1.4551427364349365, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8805137872695923, + "num_tokens": 876653450.0, + "step": 22976 + }, + { + "epoch": 
2.922910571174151, + "grad_norm": 1.7569992542266846, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8753072023391724, + "num_tokens": 876686783.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "grad_norm": 1.5017622709274292, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8898137807846069, + "num_tokens": 876725488.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "grad_norm": 1.5719940662384033, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8970698118209839, + "num_tokens": 876760450.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "grad_norm": 1.347739338874817, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8997551202774048, + "num_tokens": 876802121.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "grad_norm": 1.4347426891326904, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8883047699928284, + "num_tokens": 876843910.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "grad_norm": 1.6655163764953613, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8806492686271667, + "num_tokens": 876879697.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "grad_norm": 1.5550858974456787, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.890468180179596, + "num_tokens": 876917085.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "grad_norm": 1.5370771884918213, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8885840177536011, + "num_tokens": 876953358.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "grad_norm": 1.5710687637329102, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8744040727615356, + "num_tokens": 876992503.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "grad_norm": 1.746670126914978, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 
0.8847121000289917, + "num_tokens": 877024673.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "grad_norm": 1.5128002166748047, + "learning_rate": 1e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9022801518440247, + "num_tokens": 877060813.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "grad_norm": 1.5363094806671143, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8865170478820801, + "num_tokens": 877099011.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "grad_norm": 1.463706135749817, + "learning_rate": 1e-06, + "loss": 0.2658, + "mean_token_accuracy": 0.9015750288963318, + "num_tokens": 877136927.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "grad_norm": 1.5486295223236084, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8806604146957397, + "num_tokens": 877175081.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "grad_norm": 1.608044147491455, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8783323764801025, + "num_tokens": 877209991.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "grad_norm": 1.7292201519012451, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8771604299545288, + "num_tokens": 877244186.0, + "step": 22992 + }, + { + "epoch": 2.924945935631599, + "grad_norm": 1.769018292427063, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8821936845779419, + "num_tokens": 877276353.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "grad_norm": 1.5288130044937134, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.900917649269104, + "num_tokens": 877313032.0, + "step": 22994 + }, + { + "epoch": 2.92520035618878, + "grad_norm": 1.411378026008606, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8872657418251038, + "num_tokens": 877357258.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "grad_norm": 
1.5717244148254395, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8816165924072266, + "num_tokens": 877397318.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "grad_norm": 1.697027325630188, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8706762194633484, + "num_tokens": 877434352.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "grad_norm": 1.4056336879730225, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8930234909057617, + "num_tokens": 877475347.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "grad_norm": 1.624106764793396, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8753529787063599, + "num_tokens": 877516676.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "grad_norm": 1.6134059429168701, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8830053806304932, + "num_tokens": 877553501.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "grad_norm": 1.4838794469833374, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8975353837013245, + "num_tokens": 877590917.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "grad_norm": 1.5897859334945679, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8859084844589233, + "num_tokens": 877626633.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "grad_norm": 1.5268467664718628, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8681557774543762, + "num_tokens": 877667083.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "grad_norm": 1.655351161956787, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8890340924263, + "num_tokens": 877699562.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "grad_norm": 1.5170985460281372, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8936320543289185, + "num_tokens": 
877738797.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "grad_norm": 1.7540748119354248, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8689131736755371, + "num_tokens": 877773215.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "grad_norm": 1.5510549545288086, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8850803971290588, + "num_tokens": 877813762.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "grad_norm": 1.6313949823379517, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.895929753780365, + "num_tokens": 877848797.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "grad_norm": 1.6512207984924316, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8732837438583374, + "num_tokens": 877887022.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "grad_norm": 1.5080088376998901, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8844496607780457, + "num_tokens": 877928318.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "grad_norm": 1.8045936822891235, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8714561462402344, + "num_tokens": 877961220.0, + "step": 23011 + }, + { + "epoch": 2.9273629309248186, + "grad_norm": 1.5874342918395996, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8716392517089844, + "num_tokens": 878002438.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "grad_norm": 1.4868654012680054, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8802826404571533, + "num_tokens": 878044664.0, + "step": 23013 + }, + { + "epoch": 2.9276173514819996, + "grad_norm": 1.5721393823623657, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8802074193954468, + "num_tokens": 878085932.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "grad_norm": 1.4908456802368164, + "learning_rate": 
1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8839139938354492, + "num_tokens": 878127267.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "grad_norm": 1.6023558378219604, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8840311765670776, + "num_tokens": 878164116.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "grad_norm": 1.5920857191085815, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8950819969177246, + "num_tokens": 878200174.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "grad_norm": 1.5425134897232056, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8796259760856628, + "num_tokens": 878239854.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "grad_norm": 1.4256315231323242, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.884669303894043, + "num_tokens": 878283859.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "grad_norm": 1.634074091911316, + "learning_rate": 1e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9038745164871216, + "num_tokens": 878318653.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "grad_norm": 1.464775800704956, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8880143165588379, + "num_tokens": 878360616.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "grad_norm": 1.663967251777649, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8829824328422546, + "num_tokens": 878393550.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "grad_norm": 1.53659987449646, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.887233316898346, + "num_tokens": 878434346.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "grad_norm": 1.5487061738967896, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8807516098022461, + "num_tokens": 878469737.0, + "step": 23024 + }, + { + 
"epoch": 2.9290166645464955, + "grad_norm": 1.5463366508483887, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8863809108734131, + "num_tokens": 878506428.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "grad_norm": 1.4565064907073975, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8919633626937866, + "num_tokens": 878547235.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "grad_norm": 1.5110583305358887, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8880846500396729, + "num_tokens": 878587796.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "grad_norm": 1.4625805616378784, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8914781212806702, + "num_tokens": 878625401.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "grad_norm": 1.4927424192428589, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8922750949859619, + "num_tokens": 878662381.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "grad_norm": 1.5770964622497559, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8698611259460449, + "num_tokens": 878701705.0, + "step": 23030 + }, + { + "epoch": 2.929779926218038, + "grad_norm": 1.5273486375808716, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8918300271034241, + "num_tokens": 878738151.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "grad_norm": 1.7573305368423462, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8893205523490906, + "num_tokens": 878769039.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "grad_norm": 1.6501960754394531, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8885254859924316, + "num_tokens": 878804509.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "grad_norm": 1.659692645072937, + "learning_rate": 1e-06, + "loss": 0.3065, + 
"mean_token_accuracy": 0.8853389620780945, + "num_tokens": 878840569.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "grad_norm": 1.6714279651641846, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8931982517242432, + "num_tokens": 878874836.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "grad_norm": 1.5625358819961548, + "learning_rate": 1e-06, + "loss": 0.2667, + "mean_token_accuracy": 0.9019686579704285, + "num_tokens": 878908002.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "grad_norm": 1.5979546308517456, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8818234205245972, + "num_tokens": 878946168.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "grad_norm": 1.5466468334197998, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.892922043800354, + "num_tokens": 878982987.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "grad_norm": 1.4643768072128296, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8937957286834717, + "num_tokens": 879021314.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "grad_norm": 1.7249391078948975, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8742843270301819, + "num_tokens": 879054400.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "grad_norm": 1.5739037990570068, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8835201859474182, + "num_tokens": 879091899.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "grad_norm": 1.5762858390808105, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8815765380859375, + "num_tokens": 879131990.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "grad_norm": 1.6392829418182373, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8772119283676147, + "num_tokens": 879170452.0, + "step": 23043 + }, + { + "epoch": 
2.931433659839715, + "grad_norm": 1.4696027040481567, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8882284164428711, + "num_tokens": 879210301.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "grad_norm": 1.6158630847930908, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8849889636039734, + "num_tokens": 879253152.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "grad_norm": 1.6876157522201538, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8781127333641052, + "num_tokens": 879289401.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "grad_norm": 1.5980144739151, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8949293494224548, + "num_tokens": 879329259.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "grad_norm": 1.6080862283706665, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8774962425231934, + "num_tokens": 879365725.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "grad_norm": 1.7110414505004883, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.880180835723877, + "num_tokens": 879398499.0, + "step": 23049 + }, + { + "epoch": 2.932196921511258, + "grad_norm": 1.6880052089691162, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8819150924682617, + "num_tokens": 879434139.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "grad_norm": 1.491851568222046, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.89014732837677, + "num_tokens": 879474198.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "grad_norm": 1.683289647102356, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8760476112365723, + "num_tokens": 879516643.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "grad_norm": 1.5901395082473755, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 
0.8787136077880859, + "num_tokens": 879553185.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "grad_norm": 1.5264172554016113, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8861210942268372, + "num_tokens": 879590343.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "grad_norm": 1.6525980234146118, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8716382384300232, + "num_tokens": 879626297.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "grad_norm": 1.498355507850647, + "learning_rate": 1e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.90093994140625, + "num_tokens": 879663014.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "grad_norm": 1.5198192596435547, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8671730756759644, + "num_tokens": 879706261.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "grad_norm": 1.6643986701965332, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8933267593383789, + "num_tokens": 879736205.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "grad_norm": 1.4924242496490479, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8917191624641418, + "num_tokens": 879774554.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "grad_norm": 1.640905737876892, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8961353302001953, + "num_tokens": 879805318.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "grad_norm": 1.6318892240524292, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8591977953910828, + "num_tokens": 879849107.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "grad_norm": 1.5471856594085693, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8836464285850525, + "num_tokens": 879887731.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "grad_norm": 
1.4461793899536133, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.901146650314331, + "num_tokens": 879926440.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "grad_norm": 1.6326470375061035, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8797922134399414, + "num_tokens": 879965281.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "grad_norm": 1.5231897830963135, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8805384039878845, + "num_tokens": 880009552.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "grad_norm": 1.4750250577926636, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8918116688728333, + "num_tokens": 880049290.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "grad_norm": 1.601905107498169, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8839390277862549, + "num_tokens": 880089592.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "grad_norm": 1.5055789947509766, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8710037469863892, + "num_tokens": 880131292.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "grad_norm": 1.8208338022232056, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.879145622253418, + "num_tokens": 880165329.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "grad_norm": 1.5724698305130005, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8865069150924683, + "num_tokens": 880201870.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "grad_norm": 1.3145262002944946, + "learning_rate": 1e-06, + "loss": 0.271, + "mean_token_accuracy": 0.901013970375061, + "num_tokens": 880245340.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "grad_norm": 1.6564767360687256, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.886583685874939, + "num_tokens": 
880278570.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "grad_norm": 1.4394044876098633, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8918651938438416, + "num_tokens": 880319779.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "grad_norm": 1.586538553237915, + "learning_rate": 1e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.897153913974762, + "num_tokens": 880351675.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "grad_norm": 1.4741114377975464, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.882625937461853, + "num_tokens": 880394922.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "grad_norm": 1.5512287616729736, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8866457939147949, + "num_tokens": 880434726.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "grad_norm": 1.6775357723236084, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8782296180725098, + "num_tokens": 880469719.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "grad_norm": 1.4480552673339844, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8867014646530151, + "num_tokens": 880512546.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "grad_norm": 1.495853066444397, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8911056518554688, + "num_tokens": 880548198.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "grad_norm": 1.5194669961929321, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8806557655334473, + "num_tokens": 880588269.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "grad_norm": 1.6791094541549683, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.874108076095581, + "num_tokens": 880623886.0, + "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "grad_norm": 1.7515450716018677, + "learning_rate": 
1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8755717277526855, + "num_tokens": 880662799.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "grad_norm": 1.7185431718826294, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8711305856704712, + "num_tokens": 880696690.0, + "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "grad_norm": 1.4638793468475342, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8808810710906982, + "num_tokens": 880740138.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "grad_norm": 1.540826678276062, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.8959130048751831, + "num_tokens": 880776242.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "grad_norm": 1.5309131145477295, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8834143877029419, + "num_tokens": 880816322.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "grad_norm": 1.5690699815750122, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.889705240726471, + "num_tokens": 880853427.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "grad_norm": 1.7310644388198853, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8958797454833984, + "num_tokens": 880889418.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "grad_norm": 1.4889856576919556, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8868479132652283, + "num_tokens": 880931737.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "grad_norm": 1.439841628074646, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8935685157775879, + "num_tokens": 880969920.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "grad_norm": 1.573879361152649, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8879379034042358, + "num_tokens": 881008065.0, + "step": 23091 + }, + { + 
"epoch": 2.9375397532120595, + "grad_norm": 1.6668999195098877, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.879562497138977, + "num_tokens": 881047674.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "grad_norm": 1.655653953552246, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8767494559288025, + "num_tokens": 881087329.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "grad_norm": 1.5109158754348755, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8902840614318848, + "num_tokens": 881125738.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "grad_norm": 1.6593706607818604, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8778291940689087, + "num_tokens": 881170452.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "grad_norm": 1.7109688520431519, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8713271021842957, + "num_tokens": 881208850.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "grad_norm": 1.4179311990737915, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8928605914115906, + "num_tokens": 881248921.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "grad_norm": 1.6364789009094238, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8952627182006836, + "num_tokens": 881286217.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "grad_norm": 1.5063729286193848, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8801816701889038, + "num_tokens": 881329550.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "grad_norm": 1.5919404029846191, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8779348134994507, + "num_tokens": 881366959.0, + "step": 23100 + }, + { + "epoch": 2.9386846457193743, + "grad_norm": 1.496120572090149, + "learning_rate": 1e-06, + "loss": 0.3223, + 
"mean_token_accuracy": 0.8842154741287231, + "num_tokens": 881409239.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "grad_norm": 1.4720594882965088, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8908087015151978, + "num_tokens": 881448065.0, + "step": 23102 + }, + { + "epoch": 2.9389390662765553, + "grad_norm": 1.4767076969146729, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8828424215316772, + "num_tokens": 881487486.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "grad_norm": 1.5894901752471924, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.88462233543396, + "num_tokens": 881525981.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "grad_norm": 1.5161256790161133, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8773238658905029, + "num_tokens": 881570815.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "grad_norm": 1.527834177017212, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8951188325881958, + "num_tokens": 881611990.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "grad_norm": 1.4609591960906982, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8920996189117432, + "num_tokens": 881650116.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "grad_norm": 1.4680649042129517, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.887313187122345, + "num_tokens": 881690848.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "grad_norm": 1.6917403936386108, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8840495944023132, + "num_tokens": 881725467.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "grad_norm": 1.4280186891555786, + "learning_rate": 1e-06, + "loss": 0.249, + "mean_token_accuracy": 0.9082241058349609, + "num_tokens": 881765525.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, 
+ "grad_norm": 1.5454760789871216, + "learning_rate": 1e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9026259183883667, + "num_tokens": 881801137.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "grad_norm": 1.5354832410812378, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8874692320823669, + "num_tokens": 881838833.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "grad_norm": 1.5259554386138916, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8967543244361877, + "num_tokens": 881877115.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "grad_norm": 1.575682282447815, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8784809708595276, + "num_tokens": 881916120.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "grad_norm": 1.502018690109253, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8926381468772888, + "num_tokens": 881956486.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "grad_norm": 1.579833984375, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8852112889289856, + "num_tokens": 881990783.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "grad_norm": 1.4081494808197021, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8803528547286987, + "num_tokens": 882036226.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "grad_norm": 1.535539984703064, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8988312482833862, + "num_tokens": 882074729.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "grad_norm": 1.4733881950378418, + "learning_rate": 1e-06, + "loss": 0.2687, + "mean_token_accuracy": 0.9030935764312744, + "num_tokens": 882114069.0, + "step": 23119 + }, + { + "epoch": 2.941101641012594, + "grad_norm": 1.544696569442749, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8822067379951477, + 
"num_tokens": 882154911.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "grad_norm": 1.6607115268707275, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8867619633674622, + "num_tokens": 882189257.0, + "step": 23121 + }, + { + "epoch": 2.941356061569775, + "grad_norm": 1.535334587097168, + "learning_rate": 1e-06, + "loss": 0.2609, + "mean_token_accuracy": 0.9035338163375854, + "num_tokens": 882221103.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "grad_norm": 1.4604469537734985, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8898831009864807, + "num_tokens": 882262333.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "grad_norm": 1.463204026222229, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8862899541854858, + "num_tokens": 882303827.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "grad_norm": 1.5978704690933228, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8775616884231567, + "num_tokens": 882342912.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "grad_norm": 1.781539797782898, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.86359703540802, + "num_tokens": 882377660.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "grad_norm": 1.5169600248336792, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8885258436203003, + "num_tokens": 882417688.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "grad_norm": 1.6233222484588623, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8793193101882935, + "num_tokens": 882456790.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "grad_norm": 1.6236683130264282, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8695282936096191, + "num_tokens": 882496803.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "grad_norm": 1.5323615074157715, + 
"learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8808209896087646, + "num_tokens": 882536863.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "grad_norm": 1.5713378190994263, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8812922239303589, + "num_tokens": 882574489.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "grad_norm": 1.569993495941162, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8785904049873352, + "num_tokens": 882611688.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "grad_norm": 1.5712263584136963, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8723033666610718, + "num_tokens": 882650758.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "grad_norm": 1.6103270053863525, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.870979905128479, + "num_tokens": 882688972.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "grad_norm": 1.650429606437683, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8884175419807434, + "num_tokens": 882723140.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "grad_norm": 1.4963513612747192, + "learning_rate": 1e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9003615975379944, + "num_tokens": 882756187.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "grad_norm": 1.412289023399353, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8808797001838684, + "num_tokens": 882800419.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "grad_norm": 1.498763084411621, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8884904384613037, + "num_tokens": 882838104.0, + "step": 23138 + }, + { + "epoch": 2.9435186363058135, + "grad_norm": 1.5218812227249146, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8805505037307739, + "num_tokens": 882877299.0, + "step": 
23139 + }, + { + "epoch": 2.943645846584404, + "grad_norm": 1.5942620038986206, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8901109099388123, + "num_tokens": 882909771.0, + "step": 23140 + }, + { + "epoch": 2.9437730568629945, + "grad_norm": 1.6219531297683716, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8757219910621643, + "num_tokens": 882946370.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "grad_norm": 1.6928765773773193, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8821484446525574, + "num_tokens": 882980454.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "grad_norm": 1.5150245428085327, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8887020349502563, + "num_tokens": 883022904.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "grad_norm": 1.5401146411895752, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8916187882423401, + "num_tokens": 883059876.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "grad_norm": 1.524893045425415, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8752555847167969, + "num_tokens": 883099090.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "grad_norm": 1.6460978984832764, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8842655420303345, + "num_tokens": 883136509.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "grad_norm": 1.48580002784729, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8885447978973389, + "num_tokens": 883176422.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "grad_norm": 1.5021839141845703, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8813136219978333, + "num_tokens": 883219226.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "grad_norm": 1.5735692977905273, + "learning_rate": 1e-06, + "loss": 0.3265, + 
"mean_token_accuracy": 0.8820856213569641, + "num_tokens": 883256957.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "grad_norm": 1.6762871742248535, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8839666843414307, + "num_tokens": 883292324.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "grad_norm": 1.5395727157592773, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8857722282409668, + "num_tokens": 883335631.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "grad_norm": 1.4582077264785767, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8838949799537659, + "num_tokens": 883376986.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "grad_norm": 1.5436996221542358, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8903750777244568, + "num_tokens": 883412071.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "grad_norm": 1.5273759365081787, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8742728233337402, + "num_tokens": 883455469.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "grad_norm": 1.6964843273162842, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8788621425628662, + "num_tokens": 883489500.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "grad_norm": 1.595868468284607, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8881230354309082, + "num_tokens": 883524258.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "grad_norm": 1.5009270906448364, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8859508633613586, + "num_tokens": 883565059.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "grad_norm": 1.493143081665039, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8756282329559326, + "num_tokens": 883606340.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + 
"grad_norm": 1.6735938787460327, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.874366819858551, + "num_tokens": 883643519.0, + "step": 23159 + }, + { + "epoch": 2.946190052156214, + "grad_norm": 1.6470521688461304, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.8995811939239502, + "num_tokens": 883675651.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "grad_norm": 1.5088039636611938, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8847944736480713, + "num_tokens": 883714611.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "grad_norm": 1.4177082777023315, + "learning_rate": 1e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.9040040969848633, + "num_tokens": 883754066.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "grad_norm": 1.5155411958694458, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8943727612495422, + "num_tokens": 883790845.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "grad_norm": 1.7826125621795654, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.860392689704895, + "num_tokens": 883827327.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "grad_norm": 1.6434394121170044, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8889226913452148, + "num_tokens": 883861683.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "grad_norm": 1.5616872310638428, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8820571899414062, + "num_tokens": 883900747.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "grad_norm": 1.593843936920166, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8790602684020996, + "num_tokens": 883938290.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "grad_norm": 1.6431901454925537, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8832306861877441, + 
"num_tokens": 883974611.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "grad_norm": 1.5803228616714478, + "learning_rate": 1e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.8986861705780029, + "num_tokens": 884007142.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "grad_norm": 1.5105035305023193, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8934810161590576, + "num_tokens": 884045030.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "grad_norm": 1.6202969551086426, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.871872067451477, + "num_tokens": 884084827.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "grad_norm": 1.4743623733520508, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.878591775894165, + "num_tokens": 884127123.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "grad_norm": 1.5377150774002075, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8929493427276611, + "num_tokens": 884164631.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "grad_norm": 1.5998860597610474, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8862109184265137, + "num_tokens": 884200931.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "grad_norm": 1.6623666286468506, + "learning_rate": 1e-06, + "loss": 0.2685, + "mean_token_accuracy": 0.9024420976638794, + "num_tokens": 884232970.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "grad_norm": 1.5832915306091309, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8864797949790955, + "num_tokens": 884271196.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "grad_norm": 1.6502838134765625, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8756453990936279, + "num_tokens": 884305116.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "grad_norm": 1.6396889686584473, + 
"learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8760143518447876, + "num_tokens": 884339490.0, + "step": 23178 + }, + { + "epoch": 2.9486070474494337, + "grad_norm": 1.6412523984909058, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8885940313339233, + "num_tokens": 884372382.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "grad_norm": 1.5498995780944824, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8776500225067139, + "num_tokens": 884414568.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "grad_norm": 1.4824798107147217, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8997045755386353, + "num_tokens": 884453083.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "grad_norm": 1.629282832145691, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8721820116043091, + "num_tokens": 884490933.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "grad_norm": 1.6137702465057373, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8824429512023926, + "num_tokens": 884529043.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "grad_norm": 1.5804036855697632, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8796601295471191, + "num_tokens": 884568123.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "grad_norm": 1.5684255361557007, + "learning_rate": 1e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.8992494344711304, + "num_tokens": 884601702.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "grad_norm": 1.5017261505126953, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8861101865768433, + "num_tokens": 884642234.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "grad_norm": 1.79690420627594, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.87912917137146, + "num_tokens": 884682398.0, + "step": 
23187 + }, + { + "epoch": 2.9497519399567484, + "grad_norm": 1.5187844038009644, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.889570415019989, + "num_tokens": 884722079.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "grad_norm": 1.6568063497543335, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8743605613708496, + "num_tokens": 884758307.0, + "step": 23189 + }, + { + "epoch": 2.9500063605139295, + "grad_norm": 1.487042784690857, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8914662003517151, + "num_tokens": 884798059.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "grad_norm": 1.574283242225647, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8858876824378967, + "num_tokens": 884837385.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "grad_norm": 1.709688663482666, + "learning_rate": 1e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.8973714113235474, + "num_tokens": 884865436.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "grad_norm": 1.441848874092102, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8905091881752014, + "num_tokens": 884906228.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "grad_norm": 1.5329065322875977, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8899942636489868, + "num_tokens": 884945649.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "grad_norm": 1.489166021347046, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9029110074043274, + "num_tokens": 884982134.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "grad_norm": 1.6629337072372437, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8915024995803833, + "num_tokens": 885013117.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "grad_norm": 1.5410809516906738, + "learning_rate": 1e-06, + "loss": 0.3119, + 
"mean_token_accuracy": 0.8919740915298462, + "num_tokens": 885054939.0, + "step": 23197 + }, + { + "epoch": 2.9510240427426537, + "grad_norm": 1.5602961778640747, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.8972128629684448, + "num_tokens": 885087796.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "grad_norm": 1.802968144416809, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8769724369049072, + "num_tokens": 885120084.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "grad_norm": 1.4319233894348145, + "learning_rate": 1e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9031764268875122, + "num_tokens": 885159323.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "grad_norm": 1.6762773990631104, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8762681484222412, + "num_tokens": 885195844.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "grad_norm": 1.5934430360794067, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.882878303527832, + "num_tokens": 885233291.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "grad_norm": 1.6807175874710083, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8731729984283447, + "num_tokens": 885268768.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "grad_norm": 1.6689056158065796, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8848257660865784, + "num_tokens": 885302335.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "grad_norm": 1.4489026069641113, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8837428092956543, + "num_tokens": 885349253.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "grad_norm": 1.6309864521026611, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8893564939498901, + "num_tokens": 885384608.0, + "step": 23206 + }, + { + "epoch": 
2.952168935249968, + "grad_norm": 1.5969657897949219, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8874426484107971, + "num_tokens": 885421619.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "grad_norm": 1.6401538848876953, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8860958814620972, + "num_tokens": 885454173.0, + "step": 23208 + }, + { + "epoch": 2.952423355807149, + "grad_norm": 1.5600359439849854, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8898786902427673, + "num_tokens": 885491534.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "grad_norm": 1.5221331119537354, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8868913650512695, + "num_tokens": 885530262.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "grad_norm": 1.6050913333892822, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8827961683273315, + "num_tokens": 885568672.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "grad_norm": 1.6158767938613892, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8738491535186768, + "num_tokens": 885604638.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "grad_norm": 1.6222563982009888, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8908158540725708, + "num_tokens": 885638532.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "grad_norm": 1.422868013381958, + "learning_rate": 1e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9044585227966309, + "num_tokens": 885679440.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "grad_norm": 1.5610042810440063, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8920011520385742, + "num_tokens": 885715426.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "grad_norm": 1.7445954084396362, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 
0.8825536370277405, + "num_tokens": 885753335.0, + "step": 23216 + }, + { + "epoch": 2.9534410380358733, + "grad_norm": 1.612870693206787, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8768506050109863, + "num_tokens": 885792223.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "grad_norm": 1.579830527305603, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.887397289276123, + "num_tokens": 885827501.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "grad_norm": 1.4625380039215088, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8954388499259949, + "num_tokens": 885865206.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "grad_norm": 1.6369420289993286, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.883416473865509, + "num_tokens": 885898580.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "grad_norm": 1.6071503162384033, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8869101405143738, + "num_tokens": 885933673.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "grad_norm": 1.7302801609039307, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8793551325798035, + "num_tokens": 885966577.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "grad_norm": 1.4125049114227295, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.887305498123169, + "num_tokens": 886012070.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "grad_norm": 1.7583715915679932, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8819025158882141, + "num_tokens": 886045003.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "grad_norm": 1.6829103231430054, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8861042261123657, + "num_tokens": 886080346.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "grad_norm": 
1.7648124694824219, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8790758848190308, + "num_tokens": 886119589.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "grad_norm": 1.5500119924545288, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8879938125610352, + "num_tokens": 886154462.0, + "step": 23227 + }, + { + "epoch": 2.954840351100369, + "grad_norm": 1.528572678565979, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8891076445579529, + "num_tokens": 886188437.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "grad_norm": 1.4453635215759277, + "learning_rate": 1e-06, + "loss": 0.2749, + "mean_token_accuracy": 0.8985575437545776, + "num_tokens": 886225347.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "grad_norm": 1.6529755592346191, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8947576284408569, + "num_tokens": 886259981.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "grad_norm": 1.639320969581604, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8792998790740967, + "num_tokens": 886296672.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "grad_norm": 1.5607706308364868, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.867118775844574, + "num_tokens": 886336891.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "grad_norm": 1.7517163753509521, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8764280676841736, + "num_tokens": 886370277.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "grad_norm": 1.668533205986023, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8742753863334656, + "num_tokens": 886403792.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "grad_norm": 1.466909646987915, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8904638290405273, + "num_tokens": 
886446528.0, + "step": 23235 + }, + { + "epoch": 2.955858033329093, + "grad_norm": 1.5368067026138306, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8907278776168823, + "num_tokens": 886484391.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "grad_norm": 1.6103627681732178, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8727967739105225, + "num_tokens": 886524622.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "grad_norm": 1.444312572479248, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8936504125595093, + "num_tokens": 886566921.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "grad_norm": 1.6621570587158203, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8758945465087891, + "num_tokens": 886604466.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "grad_norm": 1.5426523685455322, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8840309381484985, + "num_tokens": 886647103.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "grad_norm": 1.5410956144332886, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8845402002334595, + "num_tokens": 886688189.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "grad_norm": 1.609213948249817, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.894788920879364, + "num_tokens": 886722244.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "grad_norm": 1.6335281133651733, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8908601403236389, + "num_tokens": 886760360.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "grad_norm": 1.611624836921692, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8633221387863159, + "num_tokens": 886801924.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "grad_norm": 1.760293960571289, + "learning_rate": 1e-06, 
+ "loss": 0.3215, + "mean_token_accuracy": 0.8839882016181946, + "num_tokens": 886839319.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "grad_norm": 1.4940001964569092, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.894150972366333, + "num_tokens": 886879107.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "grad_norm": 1.5450228452682495, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.892058789730072, + "num_tokens": 886912349.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "grad_norm": 1.7971056699752808, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8842931985855103, + "num_tokens": 886943048.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "grad_norm": 1.5346547365188599, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8974140882492065, + "num_tokens": 886978461.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "grad_norm": 1.7930041551589966, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8951231241226196, + "num_tokens": 887012316.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "grad_norm": 1.6589008569717407, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8885643482208252, + "num_tokens": 887046297.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "grad_norm": 1.5683685541152954, + "learning_rate": 1e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.8985955715179443, + "num_tokens": 887078648.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "grad_norm": 1.4760957956314087, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8954508304595947, + "num_tokens": 887117435.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "grad_norm": 1.4253432750701904, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8892630338668823, + "num_tokens": 887159207.0, + "step": 23254 + }, + { + "epoch": 
2.958275028622313, + "grad_norm": 1.7212177515029907, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8821799755096436, + "num_tokens": 887192142.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "grad_norm": 1.5641084909439087, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8738007545471191, + "num_tokens": 887232358.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "grad_norm": 1.583705186843872, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8912663459777832, + "num_tokens": 887268083.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "grad_norm": 1.588309407234192, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8858113288879395, + "num_tokens": 887302971.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "grad_norm": 1.4700522422790527, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.897299587726593, + "num_tokens": 887344825.0, + "step": 23259 + }, + { + "epoch": 2.958911080015265, + "grad_norm": 1.6786367893218994, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8794604539871216, + "num_tokens": 887381371.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "grad_norm": 1.4884991645812988, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8876355290412903, + "num_tokens": 887419257.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "grad_norm": 1.5157135725021362, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8873664736747742, + "num_tokens": 887461009.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "grad_norm": 1.42737877368927, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8880643248558044, + "num_tokens": 887503145.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "grad_norm": 1.5927584171295166, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 
0.8898457288742065, + "num_tokens": 887539398.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "grad_norm": 1.5772053003311157, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8892852663993835, + "num_tokens": 887578819.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "grad_norm": 1.6950079202651978, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8787305951118469, + "num_tokens": 887621689.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "grad_norm": 1.4650840759277344, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8927176594734192, + "num_tokens": 887660838.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "grad_norm": 1.631442666053772, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8847232460975647, + "num_tokens": 887694392.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "grad_norm": 1.4077653884887695, + "learning_rate": 1e-06, + "loss": 0.2532, + "mean_token_accuracy": 0.908113420009613, + "num_tokens": 887733540.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "grad_norm": 1.652002215385437, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8742406368255615, + "num_tokens": 887772011.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "grad_norm": 1.3797063827514648, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8937585353851318, + "num_tokens": 887815934.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "grad_norm": 1.7138731479644775, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8913047313690186, + "num_tokens": 887848678.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "grad_norm": 1.576000690460205, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8685120344161987, + "num_tokens": 887889173.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "grad_norm": 
1.4834483861923218, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.899502158164978, + "num_tokens": 887925349.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "grad_norm": 1.4599087238311768, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8816680908203125, + "num_tokens": 887966696.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "grad_norm": 1.5119349956512451, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8714893460273743, + "num_tokens": 888006458.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "grad_norm": 1.4261740446090698, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8901800513267517, + "num_tokens": 888047875.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "grad_norm": 1.5030282735824585, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8966109752655029, + "num_tokens": 888086157.0, + "step": 23278 + }, + { + "epoch": 2.9613280753084847, + "grad_norm": 1.2754547595977783, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.901163637638092, + "num_tokens": 888131441.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "grad_norm": 1.4684674739837646, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.878452479839325, + "num_tokens": 888173812.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "grad_norm": 1.5345897674560547, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8857088088989258, + "num_tokens": 888213794.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "grad_norm": 1.7463213205337524, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8872619271278381, + "num_tokens": 888245033.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "grad_norm": 1.4808053970336914, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8867427110671997, + 
"num_tokens": 888283781.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "grad_norm": 1.5400890111923218, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8964808583259583, + "num_tokens": 888318124.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "grad_norm": 1.4850119352340698, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8971370458602905, + "num_tokens": 888355151.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "grad_norm": 1.431628704071045, + "learning_rate": 1e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9009125232696533, + "num_tokens": 888396954.0, + "step": 23286 + }, + { + "epoch": 2.962345757537209, + "grad_norm": 1.7580095529556274, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8751770257949829, + "num_tokens": 888435725.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "grad_norm": 1.45095956325531, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8971469402313232, + "num_tokens": 888475348.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "grad_norm": 1.5990209579467773, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8943000435829163, + "num_tokens": 888509832.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "grad_norm": 1.4959461688995361, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8702797293663025, + "num_tokens": 888555031.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "grad_norm": 1.4472036361694336, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8924339413642883, + "num_tokens": 888593983.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "grad_norm": 1.5673060417175293, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8746309876441956, + "num_tokens": 888632933.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "grad_norm": 1.596174716949463, + 
"learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8830951452255249, + "num_tokens": 888670676.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "grad_norm": 1.4286915063858032, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8886421918869019, + "num_tokens": 888711792.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "grad_norm": 1.643966555595398, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8969238996505737, + "num_tokens": 888742156.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "grad_norm": 1.7811357975006104, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.8942991495132446, + "num_tokens": 888771478.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "grad_norm": 1.4985255002975464, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8901855945587158, + "num_tokens": 888813341.0, + "step": 23297 + }, + { + "epoch": 2.963745070601705, + "grad_norm": 1.6132842302322388, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8779755234718323, + "num_tokens": 888853136.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "grad_norm": 1.4985437393188477, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8896183967590332, + "num_tokens": 888892331.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "grad_norm": 1.5240404605865479, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8920907378196716, + "num_tokens": 888931363.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "grad_norm": 1.619737982749939, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8675582408905029, + "num_tokens": 888967228.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "grad_norm": 1.5815192461013794, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8868540525436401, + "num_tokens": 889004330.0, + 
"step": 23302 + }, + { + "epoch": 2.9643811219946574, + "grad_norm": 1.619253158569336, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8769649267196655, + "num_tokens": 889042379.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "grad_norm": 1.7038077116012573, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8881973028182983, + "num_tokens": 889076376.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "grad_norm": 1.5746580362319946, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.8998134136199951, + "num_tokens": 889108797.0, + "step": 23305 + }, + { + "epoch": 2.9647627528304286, + "grad_norm": 1.596506118774414, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8915449380874634, + "num_tokens": 889148266.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "grad_norm": 1.5785984992980957, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8830844163894653, + "num_tokens": 889185310.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "grad_norm": 1.5197306871414185, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8739246129989624, + "num_tokens": 889228692.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "grad_norm": 1.5693700313568115, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8766248822212219, + "num_tokens": 889266010.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "grad_norm": 1.5373111963272095, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.890813410282135, + "num_tokens": 889300914.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "grad_norm": 1.495816946029663, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8972057104110718, + "num_tokens": 889341248.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "grad_norm": 1.4924774169921875, + "learning_rate": 1e-06, + "loss": 
0.315, + "mean_token_accuracy": 0.8881675601005554, + "num_tokens": 889381244.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "grad_norm": 1.5514391660690308, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8972494602203369, + "num_tokens": 889418866.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "grad_norm": 1.4864495992660522, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8899099826812744, + "num_tokens": 889458386.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "grad_norm": 1.5668319463729858, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8750736713409424, + "num_tokens": 889498721.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "grad_norm": 1.638437032699585, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8798031806945801, + "num_tokens": 889533688.0, + "step": 23316 + }, + { + "epoch": 2.9661620658949244, + "grad_norm": 1.488232135772705, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8958404064178467, + "num_tokens": 889577034.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "grad_norm": 1.6116266250610352, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8765848278999329, + "num_tokens": 889615666.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "grad_norm": 1.5417520999908447, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8890044689178467, + "num_tokens": 889653801.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "grad_norm": 1.6296250820159912, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8810775876045227, + "num_tokens": 889692422.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "grad_norm": 1.6186579465866089, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8830963969230652, + "num_tokens": 889726130.0, + "step": 23321 + }, + { + "epoch": 
2.966798117287877, + "grad_norm": 1.5966691970825195, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9018890261650085, + "num_tokens": 889761860.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "grad_norm": 1.4604027271270752, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8931921124458313, + "num_tokens": 889799430.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "grad_norm": 1.4805052280426025, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8925900459289551, + "num_tokens": 889838353.0, + "step": 23324 + }, + { + "epoch": 2.967179748123648, + "grad_norm": 1.5231940746307373, + "learning_rate": 1e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.895748496055603, + "num_tokens": 889873308.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "grad_norm": 1.6579644680023193, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8863622546195984, + "num_tokens": 889905729.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "grad_norm": 1.4541752338409424, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8916570544242859, + "num_tokens": 889947645.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "grad_norm": 1.531590461730957, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8868410587310791, + "num_tokens": 889984434.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "grad_norm": 1.5991352796554565, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.9004771113395691, + "num_tokens": 890016893.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "grad_norm": 1.5655988454818726, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.859290599822998, + "num_tokens": 890062050.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "grad_norm": 1.5683445930480957, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 
0.8829449415206909, + "num_tokens": 890100515.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "grad_norm": 1.4194326400756836, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8844588994979858, + "num_tokens": 890140536.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "grad_norm": 1.6182678937911987, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8793472051620483, + "num_tokens": 890177305.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "grad_norm": 1.536631464958191, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8853256702423096, + "num_tokens": 890216598.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "grad_norm": 1.50862455368042, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.892504096031189, + "num_tokens": 890253890.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "grad_norm": 1.5045442581176758, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8938364386558533, + "num_tokens": 890291690.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "grad_norm": 1.5809684991836548, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8871060013771057, + "num_tokens": 890330442.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "grad_norm": 1.6573113203048706, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8885397911071777, + "num_tokens": 890366806.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "grad_norm": 1.6807516813278198, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.89239901304245, + "num_tokens": 890397734.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "grad_norm": 1.6116809844970703, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8805721998214722, + "num_tokens": 890435697.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "grad_norm": 
1.6335846185684204, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8766327500343323, + "num_tokens": 890472627.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "grad_norm": 1.5930979251861572, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8845587968826294, + "num_tokens": 890510130.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "grad_norm": 1.488653540611267, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8953443169593811, + "num_tokens": 890549177.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "grad_norm": 1.478468656539917, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8951901793479919, + "num_tokens": 890589671.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "grad_norm": 1.4834171533584595, + "learning_rate": 1e-06, + "loss": 0.287, + "mean_token_accuracy": 0.8972471952438354, + "num_tokens": 890629841.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "grad_norm": 1.5212440490722656, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9019924402236938, + "num_tokens": 890668279.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "grad_norm": 1.5755414962768555, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8833534121513367, + "num_tokens": 890705039.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "grad_norm": 1.6218047142028809, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8813546895980835, + "num_tokens": 890738092.0, + "step": 23348 + }, + { + "epoch": 2.970232794809821, + "grad_norm": 1.6185306310653687, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8888144493103027, + "num_tokens": 890773434.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "grad_norm": 1.4901202917099, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8940733075141907, + "num_tokens": 
890811661.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "grad_norm": 1.5930722951889038, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8855950236320496, + "num_tokens": 890848263.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "grad_norm": 1.429824709892273, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.9011831283569336, + "num_tokens": 890884532.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "grad_norm": 1.541520357131958, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8675332069396973, + "num_tokens": 890928964.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "grad_norm": 1.4808646440505981, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8798625469207764, + "num_tokens": 890969360.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "grad_norm": 1.6163431406021118, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8872706890106201, + "num_tokens": 891002241.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "grad_norm": 1.413254737854004, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8916114568710327, + "num_tokens": 891042355.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "grad_norm": 1.7128586769104004, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.868208646774292, + "num_tokens": 891077117.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "grad_norm": 1.4827312231063843, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8865190744400024, + "num_tokens": 891115000.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "grad_norm": 1.4766318798065186, + "learning_rate": 1e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.9005805253982544, + "num_tokens": 891153596.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "grad_norm": 1.520803689956665, + "learning_rate": 1e-06, 
+ "loss": 0.2889, + "mean_token_accuracy": 0.894898533821106, + "num_tokens": 891191481.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "grad_norm": 1.6260184049606323, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8743528127670288, + "num_tokens": 891229801.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "grad_norm": 1.5782606601715088, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8919603824615479, + "num_tokens": 891268919.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "grad_norm": 1.513948917388916, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8938442468643188, + "num_tokens": 891307337.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "grad_norm": 1.496080994606018, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8776285648345947, + "num_tokens": 891347918.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "grad_norm": 1.5073450803756714, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8932679295539856, + "num_tokens": 891385077.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "grad_norm": 1.5891283750534058, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8719600439071655, + "num_tokens": 891424370.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "grad_norm": 1.6620678901672363, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8654369115829468, + "num_tokens": 891460845.0, + "step": 23367 + }, + { + "epoch": 2.9726497901030404, + "grad_norm": 1.5856751203536987, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8593644499778748, + "num_tokens": 891500999.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "grad_norm": 1.4769175052642822, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.880435585975647, + "num_tokens": 891541834.0, + "step": 23369 + }, + { + "epoch": 
2.9729042106602215, + "grad_norm": 1.5621005296707153, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8834424018859863, + "num_tokens": 891581036.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "grad_norm": 1.6217159032821655, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8795208930969238, + "num_tokens": 891616454.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "grad_norm": 1.610542893409729, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.877608060836792, + "num_tokens": 891654890.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "grad_norm": 1.5313224792480469, + "learning_rate": 1e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.8967238664627075, + "num_tokens": 891692614.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "grad_norm": 1.6281640529632568, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8802461624145508, + "num_tokens": 891730230.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "grad_norm": 1.6008539199829102, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8826899528503418, + "num_tokens": 891765915.0, + "step": 23375 + }, + { + "epoch": 2.9736674723317646, + "grad_norm": 1.5016347169876099, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8881732821464539, + "num_tokens": 891806581.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "grad_norm": 1.477256417274475, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.884217381477356, + "num_tokens": 891849395.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "grad_norm": 1.4264600276947021, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8985016345977783, + "num_tokens": 891890110.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "grad_norm": 1.4030852317810059, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 
0.8857512474060059, + "num_tokens": 891932452.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "grad_norm": 1.7462068796157837, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8838566541671753, + "num_tokens": 891962135.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "grad_norm": 1.5694961547851562, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8899581432342529, + "num_tokens": 891996286.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "grad_norm": 1.516550898551941, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8911511301994324, + "num_tokens": 892032965.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "grad_norm": 1.5389922857284546, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8886831998825073, + "num_tokens": 892070590.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "grad_norm": 1.5450825691223145, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.894399881362915, + "num_tokens": 892105741.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "grad_norm": 1.731757640838623, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8737462759017944, + "num_tokens": 892141225.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "grad_norm": 1.59489905834198, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.883032500743866, + "num_tokens": 892174319.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + "grad_norm": 1.555891990661621, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8790465593338013, + "num_tokens": 892211311.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "grad_norm": 1.7037029266357422, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8811166286468506, + "num_tokens": 892240909.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "grad_norm": 
1.7836068868637085, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8752769231796265, + "num_tokens": 892271584.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "grad_norm": 1.5312867164611816, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8927081227302551, + "num_tokens": 892309823.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "grad_norm": 1.6902544498443604, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8813360929489136, + "num_tokens": 892347701.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "grad_norm": 1.547736644744873, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8774805068969727, + "num_tokens": 892389430.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "grad_norm": 1.5197885036468506, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8947529792785645, + "num_tokens": 892425620.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "grad_norm": 1.556320309638977, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8893744945526123, + "num_tokens": 892463443.0, + "step": 23394 + }, + { + "epoch": 2.9760844676249842, + "grad_norm": 1.4886479377746582, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8943436741828918, + "num_tokens": 892503441.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "grad_norm": 1.5768276453018188, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.888323187828064, + "num_tokens": 892539059.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "grad_norm": 1.6335067749023438, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8911588191986084, + "num_tokens": 892572179.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "grad_norm": 1.4002048969268799, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8963361978530884, + 
"num_tokens": 892617682.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "grad_norm": 1.3903133869171143, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8968680500984192, + "num_tokens": 892663338.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "grad_norm": 1.5363370180130005, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8923205137252808, + "num_tokens": 892698662.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "grad_norm": 1.483043909072876, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.8967057466506958, + "num_tokens": 892739263.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "grad_norm": 1.5606368780136108, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8856056928634644, + "num_tokens": 892774986.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "grad_norm": 1.4821765422821045, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8974205851554871, + "num_tokens": 892814807.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "grad_norm": 1.6470626592636108, + "learning_rate": 1e-06, + "loss": 0.268, + "mean_token_accuracy": 0.9004818201065063, + "num_tokens": 892844349.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "grad_norm": 1.6091017723083496, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8932180404663086, + "num_tokens": 892879771.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "grad_norm": 1.563313603401184, + "learning_rate": 1e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9023587703704834, + "num_tokens": 892912160.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "grad_norm": 1.4883531332015991, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.885939359664917, + "num_tokens": 892954221.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "grad_norm": 1.5782721042633057, + 
"learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8560941815376282, + "num_tokens": 892995745.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "grad_norm": 1.50215744972229, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.891557514667511, + "num_tokens": 893034826.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "grad_norm": 1.621850609779358, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8841098546981812, + "num_tokens": 893070126.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "grad_norm": 1.5040743350982666, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8665213584899902, + "num_tokens": 893113499.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "grad_norm": 1.6256183385849, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8831719756126404, + "num_tokens": 893148574.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "grad_norm": 1.535155177116394, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8892743587493896, + "num_tokens": 893184790.0, + "step": 23413 + }, + { + "epoch": 2.978501462918204, + "grad_norm": 1.7180660963058472, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8669180870056152, + "num_tokens": 893220244.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "grad_norm": 1.5446799993515015, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8953585624694824, + "num_tokens": 893255116.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "grad_norm": 1.544066071510315, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8758949637413025, + "num_tokens": 893295067.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "grad_norm": 1.5971436500549316, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8750525116920471, + "num_tokens": 893336582.0, + "step": 23417 + 
}, + { + "epoch": 2.979010304032566, + "grad_norm": 1.5874189138412476, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8856625556945801, + "num_tokens": 893375577.0, + "step": 23418 + }, + { + "epoch": 2.9791375143111565, + "grad_norm": 1.526454210281372, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8864455223083496, + "num_tokens": 893414151.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "grad_norm": 1.4552717208862305, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8974653482437134, + "num_tokens": 893452565.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "grad_norm": 1.4655303955078125, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8824931383132935, + "num_tokens": 893499442.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "grad_norm": 1.6745704412460327, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8845517635345459, + "num_tokens": 893533189.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "grad_norm": 1.4896981716156006, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8925230503082275, + "num_tokens": 893572472.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "grad_norm": 1.4608426094055176, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8944663405418396, + "num_tokens": 893613452.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "grad_norm": 1.546857476234436, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8829638361930847, + "num_tokens": 893652299.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "grad_norm": 1.6828449964523315, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8730378746986389, + "num_tokens": 893687930.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "grad_norm": 1.5786967277526855, + "learning_rate": 1e-06, + "loss": 0.3296, + 
"mean_token_accuracy": 0.8798682689666748, + "num_tokens": 893721544.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "grad_norm": 1.734872579574585, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8868632316589355, + "num_tokens": 893757723.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "grad_norm": 1.5855624675750732, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8835235834121704, + "num_tokens": 893794306.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "grad_norm": 1.7018898725509644, + "learning_rate": 1e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8987531661987305, + "num_tokens": 893825226.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "grad_norm": 1.6748853921890259, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8853293657302856, + "num_tokens": 893859103.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "grad_norm": 1.492757797241211, + "learning_rate": 1e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.9113726615905762, + "num_tokens": 893893433.0, + "step": 23432 + }, + { + "epoch": 2.9809184582114234, + "grad_norm": 1.7088121175765991, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8866178393363953, + "num_tokens": 893928179.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "grad_norm": 1.7256665229797363, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8834090232849121, + "num_tokens": 893963482.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "grad_norm": 1.480299711227417, + "learning_rate": 1e-06, + "loss": 0.2427, + "mean_token_accuracy": 0.9093540906906128, + "num_tokens": 893995771.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "grad_norm": 1.7195496559143066, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8838441371917725, + "num_tokens": 894029713.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, 
+ "grad_norm": 1.360032558441162, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.8987383842468262, + "num_tokens": 894069956.0, + "step": 23437 + }, + { + "epoch": 2.981554509604376, + "grad_norm": 1.6361443996429443, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8889648914337158, + "num_tokens": 894105455.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "grad_norm": 1.7256007194519043, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8874197006225586, + "num_tokens": 894136047.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "grad_norm": 1.5034130811691284, + "learning_rate": 1e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.9001114964485168, + "num_tokens": 894174724.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "grad_norm": 1.6418828964233398, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8811821341514587, + "num_tokens": 894208640.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "grad_norm": 1.7340867519378662, + "learning_rate": 1e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.9004311561584473, + "num_tokens": 894239794.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "grad_norm": 1.688496470451355, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8710801005363464, + "num_tokens": 894271652.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "grad_norm": 1.4606109857559204, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8978805541992188, + "num_tokens": 894309273.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "grad_norm": 1.6485323905944824, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8858410716056824, + "num_tokens": 894343617.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "grad_norm": 1.5748142004013062, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.890274167060852, + 
"num_tokens": 894378754.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "grad_norm": 1.4854260683059692, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8854219913482666, + "num_tokens": 894419358.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "grad_norm": 1.5795528888702393, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8824570178985596, + "num_tokens": 894457314.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "grad_norm": 1.466049313545227, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.8959851861000061, + "num_tokens": 894499105.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "grad_norm": 1.379217267036438, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.881890058517456, + "num_tokens": 894543218.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "grad_norm": 1.5182480812072754, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8778609037399292, + "num_tokens": 894582947.0, + "step": 23451 + }, + { + "epoch": 2.983335453504643, + "grad_norm": 1.4662131071090698, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8914000988006592, + "num_tokens": 894625398.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "grad_norm": 1.4900705814361572, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8951928615570068, + "num_tokens": 894662578.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "grad_norm": 1.4998784065246582, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.885942816734314, + "num_tokens": 894703154.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "grad_norm": 1.707334280014038, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.885791540145874, + "num_tokens": 894736684.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "grad_norm": 1.4977566003799438, + 
"learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8983660936355591, + "num_tokens": 894773337.0, + "step": 23456 + }, + { + "epoch": 2.9839715048975957, + "grad_norm": 1.6483815908432007, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8992617130279541, + "num_tokens": 894804911.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "grad_norm": 1.5159183740615845, + "learning_rate": 1e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9023208618164062, + "num_tokens": 894840056.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "grad_norm": 1.4929922819137573, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8837913870811462, + "num_tokens": 894881381.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "grad_norm": 1.5021275281906128, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8774739503860474, + "num_tokens": 894922806.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "grad_norm": 1.6164332628250122, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8787928819656372, + "num_tokens": 894960983.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "grad_norm": 1.526037573814392, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8737627863883972, + "num_tokens": 895004842.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "grad_norm": 1.5592859983444214, + "learning_rate": 1e-06, + "loss": 0.2558, + "mean_token_accuracy": 0.9067948460578918, + "num_tokens": 895034602.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "grad_norm": 1.490759253501892, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8815207481384277, + "num_tokens": 895076912.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "grad_norm": 1.5315459966659546, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8899827003479004, + "num_tokens": 895116446.0, + "step": 
23465 + }, + { + "epoch": 2.9851163974049104, + "grad_norm": 1.537976861000061, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8764775395393372, + "num_tokens": 895156268.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "grad_norm": 1.5292065143585205, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.885056734085083, + "num_tokens": 895194255.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "grad_norm": 1.5919392108917236, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8781336545944214, + "num_tokens": 895232723.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "grad_norm": 1.5301539897918701, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8852119445800781, + "num_tokens": 895272475.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "grad_norm": 1.453630805015564, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8950598835945129, + "num_tokens": 895315231.0, + "step": 23470 + }, + { + "epoch": 2.9857524487978626, + "grad_norm": 1.5715876817703247, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8931331634521484, + "num_tokens": 895350963.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "grad_norm": 1.503340721130371, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8880511522293091, + "num_tokens": 895388499.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "grad_norm": 1.4868035316467285, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8849587440490723, + "num_tokens": 895429357.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "grad_norm": 1.4815465211868286, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8943204879760742, + "num_tokens": 895465414.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "grad_norm": 1.5911457538604736, + "learning_rate": 1e-06, + "loss": 0.318, + 
"mean_token_accuracy": 0.88872230052948, + "num_tokens": 895500960.0, + "step": 23475 + }, + { + "epoch": 2.9863885001908153, + "grad_norm": 1.6084495782852173, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8890156149864197, + "num_tokens": 895536126.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "grad_norm": 1.4221892356872559, + "learning_rate": 1e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9088249802589417, + "num_tokens": 895572905.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "grad_norm": 1.626102328300476, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8799921870231628, + "num_tokens": 895608850.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "grad_norm": 1.551474690437317, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8770720362663269, + "num_tokens": 895652107.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "grad_norm": 1.4851051568984985, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8946821689605713, + "num_tokens": 895691251.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "grad_norm": 1.4634959697723389, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.891488254070282, + "num_tokens": 895731732.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "grad_norm": 1.6090145111083984, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8879250884056091, + "num_tokens": 895768205.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "grad_norm": 1.6906909942626953, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8849217891693115, + "num_tokens": 895798916.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "grad_norm": 1.4517019987106323, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8865965604782104, + "num_tokens": 895843127.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + 
"grad_norm": 1.587040662765503, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8768920302391052, + "num_tokens": 895884133.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "grad_norm": 1.5143178701400757, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8818315267562866, + "num_tokens": 895922814.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "grad_norm": 1.5358476638793945, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8976461887359619, + "num_tokens": 895955052.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "grad_norm": 1.4719592332839966, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8873546123504639, + "num_tokens": 895997671.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "grad_norm": 1.4735196828842163, + "learning_rate": 1e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.9001935720443726, + "num_tokens": 896036269.0, + "step": 23489 + }, + { + "epoch": 2.9881694440910826, + "grad_norm": 1.4522500038146973, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8874024152755737, + "num_tokens": 896079018.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "grad_norm": 1.6472402811050415, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8851985931396484, + "num_tokens": 896114202.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "grad_norm": 1.4985291957855225, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8880096673965454, + "num_tokens": 896156122.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "grad_norm": 1.4862288236618042, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8926438093185425, + "num_tokens": 896198516.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "grad_norm": 1.587991714477539, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8906490802764893, + 
"num_tokens": 896234752.0, + "step": 23494 + }, + { + "epoch": 2.9888054954840353, + "grad_norm": 1.5570788383483887, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.894067645072937, + "num_tokens": 896269731.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "grad_norm": 1.5519541501998901, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8852313756942749, + "num_tokens": 896309029.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "grad_norm": 1.5523713827133179, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8688842058181763, + "num_tokens": 896348565.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "grad_norm": 1.550750494003296, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8715881109237671, + "num_tokens": 896393526.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "grad_norm": 1.6105470657348633, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8901166915893555, + "num_tokens": 896429518.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "grad_norm": 1.5467251539230347, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8860814571380615, + "num_tokens": 896467538.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "grad_norm": 1.5411404371261597, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8879286646842957, + "num_tokens": 896504788.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "grad_norm": 1.6205450296401978, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8803239464759827, + "num_tokens": 896540391.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "grad_norm": 1.5258822441101074, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8865286111831665, + "num_tokens": 896578925.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "grad_norm": 1.7000373601913452, + 
"learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8757966756820679, + "num_tokens": 896614459.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "grad_norm": 1.4208159446716309, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8917869925498962, + "num_tokens": 896657330.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "grad_norm": 1.4012333154678345, + "learning_rate": 1e-06, + "loss": 0.284, + "mean_token_accuracy": 0.8949599862098694, + "num_tokens": 896699379.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "grad_norm": 1.550294280052185, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8850035667419434, + "num_tokens": 896738670.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "grad_norm": 1.5300432443618774, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8804282546043396, + "num_tokens": 896779103.0, + "step": 23508 + }, + { + "epoch": 2.9905864393843022, + "grad_norm": 1.4375834465026855, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8853636980056763, + "num_tokens": 896819682.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "grad_norm": 1.451690673828125, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8811517357826233, + "num_tokens": 896861805.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "grad_norm": 1.5788397789001465, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8865780234336853, + "num_tokens": 896900384.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "grad_norm": 1.5261894464492798, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8957419395446777, + "num_tokens": 896937903.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "grad_norm": 1.5993434190750122, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8803578019142151, + "num_tokens": 896973905.0, + "step": 
23513 + }, + { + "epoch": 2.991222490777255, + "grad_norm": 1.5343295335769653, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8818246126174927, + "num_tokens": 897012192.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "grad_norm": 1.4543392658233643, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8824825286865234, + "num_tokens": 897055514.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "grad_norm": 1.557327151298523, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8892253637313843, + "num_tokens": 897091461.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "grad_norm": 1.4899652004241943, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8931606411933899, + "num_tokens": 897129781.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "grad_norm": 1.590927004814148, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8833773136138916, + "num_tokens": 897167842.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "grad_norm": 1.4990636110305786, + "learning_rate": 1e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.905745267868042, + "num_tokens": 897202109.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "grad_norm": 1.7257235050201416, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8840272426605225, + "num_tokens": 897232504.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "grad_norm": 1.4841045141220093, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.8921857476234436, + "num_tokens": 897268494.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "grad_norm": 1.7646880149841309, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8874890804290771, + "num_tokens": 897298888.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "grad_norm": 1.3937398195266724, + "learning_rate": 1e-06, + "loss": 0.3336, + 
"mean_token_accuracy": 0.8809531927108765, + "num_tokens": 897348225.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "grad_norm": 1.494096040725708, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.895907998085022, + "num_tokens": 897387030.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "grad_norm": 1.6033138036727905, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8664974570274353, + "num_tokens": 897425775.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "grad_norm": 1.7517260313034058, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.886845588684082, + "num_tokens": 897455552.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "grad_norm": 1.6242212057113647, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8905993103981018, + "num_tokens": 897494176.0, + "step": 23527 + }, + { + "epoch": 2.993003434677522, + "grad_norm": 1.4085891246795654, + "learning_rate": 1e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.8992573618888855, + "num_tokens": 897532424.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "grad_norm": 1.5754196643829346, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8961718082427979, + "num_tokens": 897568731.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "grad_norm": 1.626299262046814, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8802120685577393, + "num_tokens": 897608587.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "grad_norm": 1.5732909440994263, + "learning_rate": 1e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.9010188579559326, + "num_tokens": 897644497.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "grad_norm": 1.5467787981033325, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8889632821083069, + "num_tokens": 897680583.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + 
"grad_norm": 1.4829072952270508, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8961248993873596, + "num_tokens": 897718657.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "grad_norm": 1.596808671951294, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.885239839553833, + "num_tokens": 897752789.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "grad_norm": 1.44900643825531, + "learning_rate": 1e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.9048616290092468, + "num_tokens": 897791254.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "grad_norm": 1.4776631593704224, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8832725286483765, + "num_tokens": 897830905.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "grad_norm": 1.5439565181732178, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.893163800239563, + "num_tokens": 897870241.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "grad_norm": 1.5485668182373047, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8834962844848633, + "num_tokens": 897908775.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "grad_norm": 1.6672416925430298, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.878601610660553, + "num_tokens": 897949799.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "grad_norm": 1.4543341398239136, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8826221227645874, + "num_tokens": 897992798.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "grad_norm": 1.5686150789260864, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8832091093063354, + "num_tokens": 898031668.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "grad_norm": 1.3691688776016235, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.880950391292572, + 
"num_tokens": 898079828.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "grad_norm": 1.5225441455841064, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8892319202423096, + "num_tokens": 898116808.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "grad_norm": 1.3937721252441406, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8883492350578308, + "num_tokens": 898161890.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "grad_norm": 1.5650300979614258, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8814287185668945, + "num_tokens": 898198992.0, + "step": 23545 + }, + { + "epoch": 2.995293219692151, + "grad_norm": 1.5025651454925537, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.886925220489502, + "num_tokens": 898240714.0, + "step": 23546 + }, + { + "epoch": 2.995420429970742, + "grad_norm": 1.5439753532409668, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8805780410766602, + "num_tokens": 898281800.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "grad_norm": 1.5862714052200317, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8726074695587158, + "num_tokens": 898321373.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "grad_norm": 1.4324153661727905, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.892230749130249, + "num_tokens": 898361778.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "grad_norm": 1.6387295722961426, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8908286094665527, + "num_tokens": 898396146.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "grad_norm": 1.5604724884033203, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.871229887008667, + "num_tokens": 898437273.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "grad_norm": 1.3848260641098022, + 
"learning_rate": 1e-06, + "loss": 0.2672, + "mean_token_accuracy": 0.9038918614387512, + "num_tokens": 898478915.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "grad_norm": 1.461316704750061, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.888025164604187, + "num_tokens": 898517552.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "grad_norm": 1.4376922845840454, + "learning_rate": 1e-06, + "loss": 0.276, + "mean_token_accuracy": 0.9009807109832764, + "num_tokens": 898557593.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "grad_norm": 1.4527602195739746, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8807590007781982, + "num_tokens": 898599579.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "grad_norm": 1.5877621173858643, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8892213106155396, + "num_tokens": 898638451.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "grad_norm": 1.482723355293274, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.896487295627594, + "num_tokens": 898677472.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "grad_norm": 1.5714287757873535, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8911517858505249, + "num_tokens": 898713318.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "grad_norm": 1.5119096040725708, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8952041864395142, + "num_tokens": 898750817.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "grad_norm": 1.6149110794067383, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.874298095703125, + "num_tokens": 898789749.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "grad_norm": 1.467160940170288, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8849384784698486, + "num_tokens": 898829893.0, + "step": 
23561 + }, + { + "epoch": 2.9973285841495994, + "grad_norm": 1.6637520790100098, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8806881308555603, + "num_tokens": 898865119.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "grad_norm": 1.7212556600570679, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.874259889125824, + "num_tokens": 898902611.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "grad_norm": 1.5480096340179443, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8831924796104431, + "num_tokens": 898940496.0, + "step": 23564 + }, + { + "epoch": 2.997710214985371, + "grad_norm": 1.4979180097579956, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8747411370277405, + "num_tokens": 898980142.0, + "step": 23565 + }, + { + "epoch": 2.9978374252639615, + "grad_norm": 1.5793782472610474, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8814982175827026, + "num_tokens": 899015594.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "grad_norm": 1.5508172512054443, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8952394723892212, + "num_tokens": 899050220.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "grad_norm": 1.5980157852172852, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8826671242713928, + "num_tokens": 899090583.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "grad_norm": 1.5970947742462158, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8892910480499268, + "num_tokens": 899128428.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "grad_norm": 1.501038908958435, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8930602073669434, + "num_tokens": 899169376.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "grad_norm": 1.5181466341018677, + "learning_rate": 1e-06, + "loss": 0.3036, + 
"mean_token_accuracy": 0.892719030380249, + "num_tokens": 899207292.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "grad_norm": 1.6130671501159668, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8734584450721741, + "num_tokens": 899246381.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "grad_norm": 1.6191591024398804, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8930662870407104, + "num_tokens": 899285395.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "grad_norm": 1.530009150505066, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8854090571403503, + "num_tokens": 899326740.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "grad_norm": 1.4344291687011719, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8878250122070312, + "num_tokens": 899370037.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "grad_norm": 1.6003845930099487, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8837262392044067, + "num_tokens": 899407970.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "grad_norm": 1.6537954807281494, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8980998992919922, + "num_tokens": 899440035.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "grad_norm": 1.462884545326233, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8832272291183472, + "num_tokens": 899479622.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "grad_norm": 1.4407830238342285, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8931890726089478, + "num_tokens": 899521333.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "grad_norm": 1.8896805047988892, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8776206374168396, + "num_tokens": 899554791.0, + "step": 23580 + }, + { + "epoch": 
2.999745579442819, + "grad_norm": 1.5613040924072266, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.8915690779685974, + "num_tokens": 899589234.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "grad_norm": 1.5120630264282227, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8883060812950134, + "num_tokens": 899627746.0, + "step": 23582 + }, + { + "epoch": 3.0, + "grad_norm": 1.6567201614379883, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8838804364204407, + "num_tokens": 899663974.0, + "step": 23583 + }, + { + "epoch": 3.0, + "step": 23583, + "total_flos": 5.628159003328302e+19, + "train_loss": 0.3662048727583553, + "train_runtime": 42474.8118, + "train_samples_per_second": 8.883, + "train_steps_per_second": 0.555 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.628159003328302e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..8dd50da --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39810510fc115f5c99ff6344869fc802ecea968125833a69dc5253b691305aae +size 13329